From 13b02cad9ae8fcc34ac0da20ba55734c09fe14eb Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 21 Jul 2017 12:30:48 -0700 Subject: [PATCH 01/21] Refactoring of get_or_generate_vocab* functions. PiperOrigin-RevId: 162771691 --- tensor2tensor/__init__.py | 1 + tensor2tensor/bin/t2t-datagen | 1 + tensor2tensor/bin/t2t-make-tf-configs | 1 + tensor2tensor/bin/t2t-trainer | 1 + tensor2tensor/data_generators/__init__.py | 1 + tensor2tensor/data_generators/algorithmic.py | 1 + .../data_generators/algorithmic_math.py | 1 + .../data_generators/algorithmic_math_test.py | 1 + .../data_generators/algorithmic_test.py | 1 + tensor2tensor/data_generators/all_problems.py | 1 + tensor2tensor/data_generators/audio.py | 1 + tensor2tensor/data_generators/audio_test.py | 1 + .../data_generators/concatenate_examples.py | 1 + .../data_generators/generator_utils.py | 148 +++++---- .../data_generators/generator_utils_test.py | 22 ++ tensor2tensor/data_generators/genetics.py | 1 + .../data_generators/genetics_test.py | 1 + tensor2tensor/data_generators/image.py | 1 + tensor2tensor/data_generators/image_test.py | 1 + tensor2tensor/data_generators/inspect.py | 1 + tensor2tensor/data_generators/lm1b.py | 1 + tensor2tensor/data_generators/problem.py | 5 +- .../data_generators/problem_hparams.py | 4 +- .../data_generators/problem_hparams_test.py | 1 + tensor2tensor/data_generators/ptb.py | 1 + tensor2tensor/data_generators/snli.py | 1 + tensor2tensor/data_generators/text_encoder.py | 1 + .../text_encoder_build_subword.py | 1 + tensor2tensor/data_generators/tokenizer.py | 1 + .../data_generators/tokenizer_test.py | 1 + tensor2tensor/data_generators/wiki.py | 1 + tensor2tensor/data_generators/wmt.py | 305 ++++++++---------- tensor2tensor/data_generators/wmt_test.py | 1 + tensor2tensor/data_generators/wsj_parsing.py | 1 + tensor2tensor/models/__init__.py | 1 + tensor2tensor/models/attention_lm.py | 1 + tensor2tensor/models/attention_lm_moe.py | 1 + tensor2tensor/models/bluenet.py | 1 + tensor2tensor/models/bluenet_test.py | 1 + tensor2tensor/models/bytenet.py | 1 + tensor2tensor/models/bytenet_test.py | 1 + tensor2tensor/models/common_attention.py | 4 +- tensor2tensor/models/common_hparams.py | 1 + tensor2tensor/models/common_layers.py | 1 + tensor2tensor/models/common_layers_test.py | 1 + tensor2tensor/models/long_answer.py | 1 + tensor2tensor/models/lstm.py | 1 + tensor2tensor/models/lstm_test.py | 1 + tensor2tensor/models/modalities.py | 1 + tensor2tensor/models/modalities_test.py | 1 + tensor2tensor/models/models.py | 1 + tensor2tensor/models/multimodel.py | 1 + tensor2tensor/models/multimodel_test.py | 1 + tensor2tensor/models/neural_gpu.py | 1 + tensor2tensor/models/neural_gpu_test.py | 1 + tensor2tensor/models/shake_shake.py | 1 + tensor2tensor/models/slicenet.py | 1 + tensor2tensor/models/slicenet_test.py | 1 + tensor2tensor/models/transformer.py | 1 + .../models/transformer_alternative.py | 1 + tensor2tensor/models/transformer_test.py | 1 + tensor2tensor/models/xception.py | 1 + tensor2tensor/models/xception_test.py | 1 + tensor2tensor/utils/__init__.py | 1 + tensor2tensor/utils/avg_checkpoints.py | 1 + tensor2tensor/utils/beam_search.py | 1 + tensor2tensor/utils/beam_search_test.py | 1 + tensor2tensor/utils/bleu_hook.py | 1 + tensor2tensor/utils/bleu_hook_test.py | 1 + tensor2tensor/utils/data_reader.py | 1 + tensor2tensor/utils/data_reader_test.py | 1 + tensor2tensor/utils/expert_utils.py | 1 + tensor2tensor/utils/get_ende_bleu.sh | 4 +- tensor2tensor/utils/metrics.py | 1 + tensor2tensor/utils/metrics_test.py | 
1 + tensor2tensor/utils/modality.py | 1 + tensor2tensor/utils/registry.py | 1 + tensor2tensor/utils/registry_test.py | 1 + tensor2tensor/utils/t2t_model.py | 1 + tensor2tensor/utils/trainer_utils.py | 17 +- tensor2tensor/utils/trainer_utils_test.py | 1 + tensor2tensor/utils/usr_dir.py | 1 + tensor2tensor/utils/yellowfin.py | 1 + tensor2tensor/utils/yellowfin_test.py | 1 + 84 files changed, 341 insertions(+), 244 deletions(-) diff --git a/tensor2tensor/__init__.py b/tensor2tensor/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/__init__.py +++ b/tensor2tensor/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index af5b47f8c..57e2b17fb 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/bin/t2t-make-tf-configs b/tensor2tensor/bin/t2t-make-tf-configs index 6a4dc8641..0b656aba6 100644 --- a/tensor2tensor/bin/t2t-make-tf-configs +++ b/tensor2tensor/bin/t2t-make-tf-configs @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer index a37767258..13dd7d355 100644 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/bin/t2t-trainer @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/__init__.py b/tensor2tensor/data_generators/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/data_generators/__init__.py +++ b/tensor2tensor/data_generators/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py index 2169e1910..676b4e45f 100644 --- a/tensor2tensor/data_generators/algorithmic.py +++ b/tensor2tensor/data_generators/algorithmic.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index e65b47ff0..e061ceb0b 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic_math_test.py b/tensor2tensor/data_generators/algorithmic_math_test.py index 5f0de29fb..7cd67a83c 100644 --- a/tensor2tensor/data_generators/algorithmic_math_test.py +++ b/tensor2tensor/data_generators/algorithmic_math_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py index fb8ff6719..57faaa80b 100644 --- a/tensor2tensor/data_generators/algorithmic_test.py +++ b/tensor2tensor/data_generators/algorithmic_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index 93a8a06a2..d8007f5e3 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/audio.py b/tensor2tensor/data_generators/audio.py index 4f8c096a5..d0747a88c 100644 --- a/tensor2tensor/data_generators/audio.py +++ b/tensor2tensor/data_generators/audio.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/audio_test.py b/tensor2tensor/data_generators/audio_test.py index 1c19432c3..57e4e1ccc 100644 --- a/tensor2tensor/data_generators/audio_test.py +++ b/tensor2tensor/data_generators/audio_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py index 158bc1b59..60ac7ea8f 100644 --- a/tensor2tensor/data_generators/concatenate_examples.py +++ b/tensor2tensor/data_generators/concatenate_examples.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 5c0c94bce..866a0f3e7 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -247,53 +248,19 @@ def gunzip_file(gz_path, new_path): ] -def get_or_generate_vocab(data_dir, tmp_dir, - vocab_filename, vocab_size, sources=None): - """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS).""" +def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size, + generator_fn): + """Inner implementation for vocab generators.""" vocab_filepath = os.path.join(data_dir, vocab_filename) if tf.gfile.Exists(vocab_filepath): tf.logging.info("Found vocab file: %s", vocab_filepath) vocab = text_encoder.SubwordTextEncoder(vocab_filepath) return vocab - sources = sources or _DATA_FILE_URLS - tf.logging.info("Generating vocab from: %s", str(sources)) token_counts = defaultdict(int) - for source in sources: - url = source[0] - filename = os.path.basename(url) - read_type = "r:gz" if "tgz" in filename else "r" - - compressed_file = maybe_download(tmp_dir, filename, url) - - with tarfile.open(compressed_file, read_type) as corpus_tar: - corpus_tar.extractall(tmp_dir) - - for lang_file in source[1]: - tf.logging.info("Reading file: %s" % lang_file) - filepath = os.path.join(tmp_dir, lang_file) - - # For some datasets a second extraction is necessary. - if ".gz" in lang_file: - new_filepath = os.path.join(tmp_dir, lang_file[:-3]) - if tf.gfile.Exists(new_filepath): - tf.logging.info( - "Subdirectory %s already exists, skipping unpacking" % filepath) - else: - tf.logging.info("Unpacking subdirectory %s" % filepath) - gunzip_file(filepath, new_filepath) - filepath = new_filepath - - # Use Tokenizer to count the word occurrences. - with tf.gfile.GFile(filepath, mode="r") as source_file: - file_byte_budget = 3.5e5 if "en" in filepath else 7e5 - for line in source_file: - if file_byte_budget <= 0: - break - line = line.strip() - file_byte_budget -= len(line) - for tok in tokenizer.encode(text_encoder.native_to_unicode(line)): - token_counts[tok] += 1 + for item in generator_fn(): + for tok in tokenizer.encode(text_encoder.native_to_unicode(item)): + token_counts[tok] += 1 vocab = text_encoder.SubwordTextEncoder.build_to_target_size( vocab_size, token_counts, 1, 1e3) @@ -301,6 +268,55 @@ def get_or_generate_vocab(data_dir, tmp_dir, return vocab +def get_or_generate_vocab(data_dir, + tmp_dir, + vocab_filename, + vocab_size, + sources=None): + """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS).""" + sources = sources or _DATA_FILE_URLS + + def generate(): + tf.logging.info("Generating vocab from: %s", str(sources)) + for source in sources: + url = source[0] + filename = os.path.basename(url) + read_type = "r:gz" if "tgz" in filename else "r" + + compressed_file = maybe_download(tmp_dir, filename, url) + + with tarfile.open(compressed_file, read_type) as corpus_tar: + corpus_tar.extractall(tmp_dir) + + for lang_file in source[1]: + tf.logging.info("Reading file: %s" % lang_file) + filepath = os.path.join(tmp_dir, lang_file) + + # For some datasets a second extraction is necessary. + if ".gz" in lang_file: + new_filepath = os.path.join(tmp_dir, lang_file[:-3]) + if tf.gfile.Exists(new_filepath): + tf.logging.info( + "Subdirectory %s already exists, skipping unpacking" % filepath) + else: + tf.logging.info("Unpacking subdirectory %s" % filepath) + gunzip_file(filepath, new_filepath) + filepath = new_filepath + + # Use Tokenizer to count the word occurrences. 
+ with tf.gfile.GFile(filepath, mode="r") as source_file: + file_byte_budget = 3.5e5 if "en" in filepath else 7e5 + for line in source_file: + if file_byte_budget <= 0: + break + line = line.strip() + file_byte_budget -= len(line) + yield line + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) + + def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, index, vocab_filename, vocab_size): r"""Generate a vocabulary from a tabbed source file. @@ -320,27 +336,37 @@ def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, Returns: The vocabulary. """ - vocab_filepath = os.path.join(data_dir, vocab_filename) - if os.path.exists(vocab_filepath): - vocab = text_encoder.SubwordTextEncoder(vocab_filepath) - return vocab - - # Use Tokenizer to count the word occurrences. - token_counts = defaultdict(int) - filepath = os.path.join(tmp_dir, source_filename) - with tf.gfile.GFile(filepath, mode="r") as source_file: - for line in source_file: - line = line.strip() - if line and "\t" in line: - parts = line.split("\t", maxsplit=1) - part = parts[index].strip() - for tok in tokenizer.encode(text_encoder.native_to_unicode(part)): - token_counts[tok] += 1 - - vocab = text_encoder.SubwordTextEncoder.build_to_target_size( - vocab_size, token_counts, 1, 1e3) - vocab.store_to_file(vocab_filepath) - return vocab + def generate(): + filepath = os.path.join(tmp_dir, source_filename) + tf.logging.info("Generating vocab from %s", filepath) + with tf.gfile.GFile(filepath, mode="r") as source_file: + for line in source_file: + line = line.strip() + if line and "\t" in line: + parts = line.split("\t", maxsplit=1) + part = parts[index].strip() + yield part + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) + + +def get_or_generate_txt_vocab(data_dir, vocab_filename, vocab_size, + filepatterns): + """Generate a vocabulary from txt files with example-per-line.""" + if isinstance(filepatterns, str): + filepatterns = [filepatterns] + + def generate(): + tf.logging.info("Generating vocab from %s", filepatterns) + for filepattern in filepatterns: + for filename in tf.gfile.Glob(filepattern): + with tf.gfile.GFile(filename, mode="r") as source_file: + for line in source_file: + yield line.strip() + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) def read_records(filename): diff --git a/tensor2tensor/data_generators/generator_utils_test.py b/tensor2tensor/data_generators/generator_utils_test.py index c776d120c..fd6e15ca3 100644 --- a/tensor2tensor/data_generators/generator_utils_test.py +++ b/tensor2tensor/data_generators/generator_utils_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -84,6 +85,27 @@ def testGunzipFile(self): os.remove(tmp_file_path + ".txt") os.remove(tmp_file_path) + def testGetOrGenerateTxtVocab(self): + data_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + test_file = os.path.join(self.get_temp_dir(), "test.txt") + with tf.gfile.Open(test_file, "w") as outfile: + outfile.write("a b c\n") + outfile.write("d e f\n") + # Create a vocab over the test file. 
+ vocab1 = generator_utils.get_or_generate_txt_vocab( + data_dir, "test.voc", 20, test_file) + self.assertTrue(tf.gfile.Exists(os.path.join(data_dir, "test.voc"))) + self.assertIsNotNone(vocab1) + + # Append a new line to the test file which would change the vocab if + # the vocab were not being read from file. + with tf.gfile.Open(test_file, "a") as outfile: + outfile.write("g h i\n") + vocab2 = generator_utils.get_or_generate_txt_vocab( + data_dir, "test.voc", 20, test_file) + self.assertTrue(tf.gfile.Exists(os.path.join(data_dir, "test.voc"))) + self.assertIsNotNone(vocab2) + self.assertEqual(vocab1.dump(), vocab2.dump()) if __name__ == "__main__": tf.test.main() diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index 255e0caf9..b4ad36544 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/genetics_test.py b/tensor2tensor/data_generators/genetics_test.py index 70b4fe495..85d70f934 100644 --- a/tensor2tensor/data_generators/genetics_test.py +++ b/tensor2tensor/data_generators/genetics_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index e3567d78f..f8e3191a2 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/image_test.py b/tensor2tensor/data_generators/image_test.py index 6c9984265..59cad4226 100644 --- a/tensor2tensor/data_generators/image_test.py +++ b/tensor2tensor/data_generators/image_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/inspect.py b/tensor2tensor/data_generators/inspect.py index dad0c1c83..124c07017 100644 --- a/tensor2tensor/data_generators/inspect.py +++ b/tensor2tensor/data_generators/inspect.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index 78fb001bc..562435184 100644 --- a/tensor2tensor/data_generators/lm1b.py +++ b/tensor2tensor/data_generators/lm1b.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 5beb0385f..690f14277 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -69,10 +70,6 @@ class SpaceID(object): ICE_PARSE_TOK = 19 # Macedonian tokens MK_TOK = 20 - # Czech tokens - CS_TOK = 21 - # Czech characters - CS_CHR = 22 class Problem(object): diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 51bc0ba62..4343afd27 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -180,9 +181,6 @@ def default_problem_hparams(): # 17: Icelandic characters # 18: Icelandic tokens # 19: Icelandic parse tokens - # 20: Macedonian tokens - # 21: Czech tokens - # 22: Czech characters # Add more above if needed. input_space_id=0, target_space_id=0, diff --git a/tensor2tensor/data_generators/problem_hparams_test.py b/tensor2tensor/data_generators/problem_hparams_test.py index ad1f0192d..df92919ef 100644 --- a/tensor2tensor/data_generators/problem_hparams_test.py +++ b/tensor2tensor/data_generators/problem_hparams_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index 9a7db3a78..f71f0d902 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/snli.py b/tensor2tensor/data_generators/snli.py index 7322c59ff..cd4ff723d 100644 --- a/tensor2tensor/data_generators/snli.py +++ b/tensor2tensor/data_generators/snli.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 8be22ce0b..7c53784f3 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py index 093101c68..a0d5d8937 100644 --- a/tensor2tensor/data_generators/text_encoder_build_subword.py +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 2b1cf572c..d1faaa7b3 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index c279290ed..189f19663 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 8f905aa96..49147962a 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 456970e62..bb31d0c0f 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -42,6 +43,23 @@ EOS = text_encoder.EOS_ID +def _default_token_feature_encoders(data_dir, target_vocab_size): + vocab_filename = os.path.join(data_dir, + "vocab.endefr.%d" % target_vocab_size) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + return { + "inputs": subtokenizer, + "targets": subtokenizer, + } + + +def _default_character_feature_encoders(): + return { + "inputs": text_encoder.ByteTextEncoder(), + "targets": text_encoder.ByteTextEncoder(), + } + + class WMTProblem(problem.Problem): """Base class for WMT problems.""" @@ -53,13 +71,14 @@ def is_character_level(self): def targeted_vocab_size(self): raise NotImplementedError() # Not needed if self.is_character_level. 
- def train_generator(self, data_dir, tmp_dir, is_training): - """Generator of the training data.""" + @property + def train_generator(self): + """Generator; takes data_dir, tmp_dir, is_training, targeted_vocab_size.""" raise NotImplementedError() - def dev_generator(self, data_dir, tmp_dir, is_training): - """Generator of the development data.""" - return self.train_generator(data_dir, tmp_dir, is_training) + @property + def dev_generator(self): + return self.train_generator @property def input_space_id(self): @@ -73,35 +92,28 @@ def target_space_id(self): def num_shards(self): return 100 - @property - def vocab_name(self): - return "vocab.endefr" - - @property - def vocab_file(self): - return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) - def generate_data(self, data_dir, tmp_dir, num_shards=None): if num_shards is None: num_shards = self.num_shards - generator_utils.generate_dataset_and_shuffle( - self.train_generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, num_shards, shuffled=False), - self.dev_generator(data_dir, tmp_dir, False), - self.dev_filepaths(data_dir, 1, shuffled=False)) + if self.is_character_level: + generator_utils.generate_dataset_and_shuffle( + self.train_generator(tmp_dir, True), + self.training_filepaths(data_dir, num_shards, shuffled=False), + self.dev_generator(tmp_dir, False), + self.dev_filepaths(data_dir, 1, shuffled=False)) + else: + generator_utils.generate_dataset_and_shuffle( + self.train_generator(data_dir, tmp_dir, True, + self.targeted_vocab_size), + self.training_filepaths(data_dir, num_shards, shuffled=False), + self.dev_generator(data_dir, tmp_dir, False, + self.targeted_vocab_size), + self.dev_filepaths(data_dir, 1, shuffled=False)) def feature_encoders(self, data_dir): if self.is_character_level: - return { - "inputs": text_encoder.ByteTextEncoder(), - "targets": text_encoder.ByteTextEncoder(), - } - vocab_filename = os.path.join(data_dir, self.vocab_file) - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - return { - "inputs": subtokenizer, - "targets": subtokenizer, - } + return _default_character_feature_encoders() + return _default_token_feature_encoders(data_dir, self.targeted_vocab_size) def hparams(self, defaults, unused_model_hparams): p = defaults @@ -163,8 +175,8 @@ def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): Args: source_path: path to the file with source and target sentences. - source_vocab: a SubwordTextEncoder to encode the source string. - target_vocab: a SubwordTextEncoder to encode the target string. + source_vocab: a SunwordTextEncoder to encode the source string. + target_vocab: a SunwordTextEncoder to encode the target string. eos: integer to append at the end of each sequence (default: None). 
Yields: @@ -325,29 +337,6 @@ def bi_vocabs_token_generator(source_path, ("dev.mk", "dev.en") ]] -# English-Czech datasets -_ENCS_TRAIN_DATASETS = [ - [ - "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long - ("training-parallel-nc-v11/news-commentary-v11.cs-en.en", - "training-parallel-nc-v11/news-commentary-v11.cs-en.cs") - ], - [ - "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", - ("commoncrawl.cs-en.en", "commoncrawl.cs-en.cs") - ], - [ - "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", - ("training/europarl-v7.cs-en.en", "training/europarl-v7.cs-en.cs") - ], -] -_ENCS_TEST_DATASETS = [ - [ - "http://data.statmt.org/wmt16/translation-task/dev.tgz", - ("dev/newstest2013.en", "dev/newstest2013.cs") - ], -] - # Generators. @@ -419,6 +408,16 @@ def _compile_data(tmp_dir, datasets, filename): return filename +def ende_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) + datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) + + @registry.register_problem("wmt_ende_tokens_8k") class WMTEnDeTokens8k(WMTProblem): """Problem spec for WMT En-De translation.""" @@ -427,13 +426,9 @@ class WMTEnDeTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - def train_generator(self, data_dir, tmp_dir, train): - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", symbolizer_vocab, EOS) + @property + def train_generator(self): + return ende_wordpiece_token_generator @property def input_space_id(self): @@ -452,6 +447,15 @@ def targeted_vocab_size(self): return 2**15 # 32768 +def ende_character_generator(tmp_dir, train): + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) + + @registry.register_problem("wmt_ende_characters") class WMTEnDeCharacters(WMTProblem): """Problem spec for WMT En-De translation.""" @@ -460,13 +464,9 @@ class WMTEnDeCharacters(WMTProblem): def is_character_level(self): return True - def train_generator(self, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) + @property + def train_generator(self): + return ende_character_generator @property def input_space_id(self): @@ -477,6 +477,29 @@ def target_space_id(self): return problem.SpaceID.DE_CHR +def zhen_wordpiece_token_bigenerator(data_dir, tmp_dir, train, + source_vocab_size, target_vocab_size): + """Wordpiece generator 
for the WMT'17 zh-en dataset.""" + datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS] + target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS] + source_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, + source_vocab_size, source_datasets) + target_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.en.%d" % target_vocab_size, + target_vocab_size, target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) + return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", + source_vocab, target_vocab, EOS) + + +def zhen_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): + return zhen_wordpiece_token_bigenerator(data_dir, tmp_dir, train, + vocab_size, vocab_size) + + @registry.register_problem("wmt_zhen_tokens_8k") class WMTZhEnTokens8k(WMTProblem): """Problem spec for WMT Zh-En translation.""" @@ -485,22 +508,9 @@ class WMTZhEnTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - def train_generator(self, data_dir, tmp_dir, train): - source_vocab_size = self.targeted_vocab_size - target_vocab_size = self.targeted_vocab_size - datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in datasets] - target_datasets = [[item[0], [item[1][1]]] for item in datasets] - source_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, source_vocab_size, - source_datasets) - target_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.en.%d" % target_vocab_size, target_vocab_size, - target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) - return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", - source_vocab, target_vocab, EOS) + @property + def train_generator(self): + return zhen_wordpiece_token_generator @property def input_space_id(self): @@ -532,6 +542,17 @@ def targeted_vocab_size(self): return 2**15 # 32768 +def enfr_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): + """Instance of token generator for the WMT en->fr task.""" + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) + datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) + + @registry.register_problem("wmt_enfr_tokens_8k") class WMTEnFrTokens8k(WMTProblem): """Problem spec for WMT En-Fr translation.""" @@ -540,13 +561,9 @@ class WMTEnFrTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - def train_generator(self, tmp_dir, train): - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) - datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", symbolizer_vocab, EOS) + @property + def train_generator(self): + return enfr_wordpiece_token_generator 
@property def input_space_id(self): @@ -565,6 +582,16 @@ def targeted_vocab_size(self): return 2**15 # 32768 +def enfr_character_generator(tmp_dir, train): + """Instance of character generator for the WMT en->fr task.""" + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) + + @registry.register_problem("wmt_enfr_characters") class WMTEnFrCharacters(WMTProblem): """Problem spec for WMT En-Fr translation.""" @@ -573,13 +600,9 @@ class WMTEnFrCharacters(WMTProblem): def is_character_level(self): return True - def train_generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) + @property + def train_generator(self): + return enfr_character_generator @property def input_space_id(self): @@ -590,6 +613,20 @@ def target_space_id(self): return problem.SpaceID.FR_CHR +def mken_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): + """Wordpiece generator for the SETimes Mk-En dataset.""" + datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in _MKEN_TRAIN_DATASETS] + target_datasets = [[item[0], [item[1][1]]] for item in _MKEN_TRAIN_DATASETS] + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.mken.%d" % vocab_size, vocab_size, + source_datasets + target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) + + @registry.register_problem("setimes_mken_tokens_32k") class SETimesMkEnTokens32k(WMTProblem): """Problem spec for SETimes Mk-En translation.""" @@ -599,20 +636,8 @@ def targeted_vocab_size(self): return 2**15 # 32768 @property - def vocab_name(self): - return "vocab.mken" - - def train_generator(self, data_dir, tmp_dir, train): - datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in datasets] - target_datasets = [[item[0], [item[1][1]]] for item in datasets] - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, - source_datasets + target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) + def train_generator(self): + return mken_wordpiece_token_generator @property def input_space_id(self): @@ -622,65 +647,7 @@ def input_space_id(self): def target_space_id(self): return problem.SpaceID.EN_TOK -@registry.register_problem("wmt_encs_tokens_32k") -class WMTEnCsTokens32k(problem.Problem): - """Problem spec for WMT English-Czech translation.""" - - @property - def target_vocab_size(self): - return 2**15 # 32768 - - @property - def vocab_name(self): - return "vocab.encs" - - def train_generator(self, data_dir, tmp_dir, train): - datasets = 
_ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in datasets] - target_datasets = [[item[0], [item[1][1]]] for item in datasets] - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, - source_datasets + target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.CS_TOK - - -@registry.register_problem("wmt_encs_characters") -class WMTEnCsCharacters(WMTProblem): - """Problem spec for WMT En-Cs character-based translation.""" - - @property - def is_character_level(self): - return True - - def train_generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_encs_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_CHR - - @property - def target_space_id(self): - return problem.SpaceID.CS_CHR - -# TODO This function is not used anywhere. def parsing_character_generator(tmp_dir, train): character_vocab = text_encoder.ByteTextEncoder() filename = "parsing_%s" % ("train" if train else "dev") diff --git a/tensor2tensor/data_generators/wmt_test.py b/tensor2tensor/data_generators/wmt_test.py index 86b88e5b1..441ceef59 100644 --- a/tensor2tensor/data_generators/wmt_test.py +++ b/tensor2tensor/data_generators/wmt_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/wsj_parsing.py b/tensor2tensor/data_generators/wsj_parsing.py index 200754e16..4b1dbdd80 100644 --- a/tensor2tensor/data_generators/wsj_parsing.py +++ b/tensor2tensor/data_generators/wsj_parsing.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/models/__init__.py +++ b/tensor2tensor/models/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 752de038e..3b874555f 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 2754e8366..4b37050bb 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index 95216f43d..3ac477e4b 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index b3f18249d..d4ce85b1a 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index 301626dc2..28862e594 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index f1e42669e..738b84251 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index c8b4a6068..4f694a7f9 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -65,9 +66,6 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) - signal = tf.reshape(signal, [length, 2, num_timescales]) - signal = tf.transpose(signal, perm=[0, 2, 1]) - signal = tf.reshape(signal, [length, channels]) signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) signal = tf.reshape(signal, [1, length, channels]) return x + signal diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index ff856968b..a86974d1f 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index 638535aa2..11b6396a8 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/common_layers_test.py b/tensor2tensor/models/common_layers_test.py index 3a2fafd8b..8e724587b 100644 --- a/tensor2tensor/models/common_layers_test.py +++ b/tensor2tensor/models/common_layers_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/long_answer.py b/tensor2tensor/models/long_answer.py index 7bb6a4a55..be8024f63 100644 --- a/tensor2tensor/models/long_answer.py +++ b/tensor2tensor/models/long_answer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index c3ae0a01e..ae221bdff 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 4ddaf6b64..1e542a666 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 60df80a1c..9a6115558 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/modalities_test.py b/tensor2tensor/models/modalities_test.py index 118db3847..4254c6b04 100644 --- a/tensor2tensor/models/modalities_test.py +++ b/tensor2tensor/models/modalities_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index 2cf639426..e92ddd3ed 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index bf06dfd65..089889ce6 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index 958fac5d7..03990594b 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index 30d535098..fc9d75639 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 1dddc1056..3d1cc0562 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py index 26d43afb3..7fa40783a 100644 --- a/tensor2tensor/models/shake_shake.py +++ b/tensor2tensor/models/shake_shake.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 2ad4c89d1..69e2338b6 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index 911953445..692799571 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c693d1ca3..23197fcd9 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/transformer_alternative.py b/tensor2tensor/models/transformer_alternative.py index 280dbc713..62413c325 100644 --- a/tensor2tensor/models/transformer_alternative.py +++ b/tensor2tensor/models/transformer_alternative.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 997b5d172..a7f1fc9ae 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index d3c5a2690..61fa61235 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index aa5c1c034..bf434aeac 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/__init__.py b/tensor2tensor/utils/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/utils/__init__.py +++ b/tensor2tensor/utils/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/avg_checkpoints.py b/tensor2tensor/utils/avg_checkpoints.py index a84750310..77acd4353 100644 --- a/tensor2tensor/utils/avg_checkpoints.py +++ b/tensor2tensor/utils/avg_checkpoints.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index 3a511907d..dd8275204 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/beam_search_test.py b/tensor2tensor/utils/beam_search_test.py index e084f1f0e..5223989ea 100644 --- a/tensor2tensor/utils/beam_search_test.py +++ b/tensor2tensor/utils/beam_search_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index 155b10c72..06d62ad1e 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/bleu_hook_test.py b/tensor2tensor/utils/bleu_hook_test.py index 8092ab979..bf08174f8 100644 --- a/tensor2tensor/utils/bleu_hook_test.py +++ b/tensor2tensor/utils/bleu_hook_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index cd8e6c2d3..d7af960ab 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index 18507ed06..f0c318e7b 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index c3becbfb4..e21f2453a 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/get_ende_bleu.sh b/tensor2tensor/utils/get_ende_bleu.sh index 3493af74c..09078414f 100755 --- a/tensor2tensor/utils/get_ende_bleu.sh +++ b/tensor2tensor/utils/get_ende_bleu.sh @@ -5,8 +5,10 @@ tok_gold_targets=newstest2013.tok.de decodes_file=$1 +cut -d' ' -f1 $decodes_file > $decodes_file.target + # Tokenize. -perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file > $decodes_file.tok +perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file.target > $decodes_file.tok # Put compounds in ATAT format (comparable to papers like GNMT, ConvS2S). # See https://nlp.stanford.edu/projects/nmt/ : diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index cf66f6af8..118e33394 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/metrics_test.py b/tensor2tensor/utils/metrics_test.py index de72d797f..0d78e632c 100644 --- a/tensor2tensor/utils/metrics_test.py +++ b/tensor2tensor/utils/metrics_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index 3ac6153b7..a42f35c24 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 5a8823510..0baad2471 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py index 1f4436b0c..3231809ea 100644 --- a/tensor2tensor/utils/registry_test.py +++ b/tensor2tensor/utils/registry_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 2a271afbf..9777568fc 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 878dbe107..96c43a5a0 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -121,6 +122,8 @@ "Whether to return 1 (False) or all (True) beams. The \n " "output file will have the format " "\t..\t") +flags.DEFINE_integer("decode_max_input_size", -1, + "Maximum number of ids in input. Or <= 0 for no max.") def _save_until_eos(hyp): @@ -693,17 +696,22 @@ def log_fn(inputs, outputs): decodes.reverse() # Dumping inputs and outputs to file filename.decodes in # format result\tinput in the same order as original inputs + if FLAGS.decode_to_file: + output_filename = FLAGS.decode_to_file + else: + output_filename = filename if FLAGS.decode_shards > 1: - base_filename = filename + ("%.2d" % FLAGS.worker_id) + base_filename = output_filename + ("%.2d" % FLAGS.worker_id) else: - base_filename = filename + base_filename = output_filename decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + str(FLAGS.decode_alpha) + ".decodes") tf.logging.info("Writing decodes into %s" % decode_filename) outfile = tf.gfile.Open(decode_filename, "w") for index in range(len(sorted_inputs)): - outfile.write("%s\n" % (decodes[sorted_keys[index]])) + outfile.write("%s\t%s\n" % (decodes[sorted_keys[index]], + sorted_inputs[sorted_keys[index]])) def decode_interactively(estimator): @@ -744,6 +752,9 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, for inputs in sorted_inputs[b * FLAGS.decode_batch_size: (b + 1) * FLAGS.decode_batch_size]: input_ids = vocabulary.encode(inputs) + if FLAGS.decode_max_input_size > 0: + # Subtract 1 for the EOS_ID. + input_ids = input_ids[:FLAGS.decode_max_input_size - 1] input_ids.append(text_encoder.EOS_ID) batch_inputs.append(input_ids) if len(input_ids) > batch_length: diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index 3ed86952b..ea88183c9 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/usr_dir.py b/tensor2tensor/utils/usr_dir.py index 0a2d0d15c..d89745b98 100644 --- a/tensor2tensor/utils/usr_dir.py +++ b/tensor2tensor/utils/usr_dir.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/yellowfin.py b/tensor2tensor/utils/yellowfin.py index 6bbe31bf6..aeb14e76e 100644 --- a/tensor2tensor/utils/yellowfin.py +++ b/tensor2tensor/utils/yellowfin.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/yellowfin_test.py b/tensor2tensor/utils/yellowfin_test.py index c4727175b..2130be2b3 100644 --- a/tensor2tensor/utils/yellowfin_test.py +++ b/tensor2tensor/utils/yellowfin_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); From 8f624dbda8d78d0331b5cc7465cc1f39bf259de1 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 21 Jul 2017 15:15:04 -0700 Subject: [PATCH 02/21] Don't repeatedly concatenate strings in a loop. 
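This change switches tokenizer.decode from growing a string with += inside the loop to appending pieces to a list and joining once at the end; repeated concatenation copies the partial string on every iteration and can degrade to quadratic time for long token sequences. A minimal, self-contained sketch of the pattern (illustrative only; the real decode additionally inserts the separating space only between adjacent alphanumeric tokens):

def join_tokens(tokens):
  # Collect the pieces in a list; u"".join copies each piece exactly once.
  pieces = []
  for i, token in enumerate(tokens):
    if i > 0:
      pieces.append(u" ")
    pieces.append(token)
  return u"".join(pieces)

assert join_tokens([u"hello", u"world"]) == u"hello world"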
PiperOrigin-RevId: 162791277 --- tensor2tensor/data_generators/tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index d1faaa7b3..0f4141199 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -101,13 +101,13 @@ def decode(tokens): Returns: a unicode string """ - ret = u"" token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens] + ret = [] for i, token in enumerate(tokens): if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]: - ret += u" " - ret += token - return ret + ret.append(u" ") + ret.append(token) + return "".join(ret) def corpus_token_counts(text_filepattern, corpus_max_lines, From e43ce968f9ce9f06dc5bb83cc0bb57af848fe3ac Mon Sep 17 00:00:00 2001 From: T2T Team Date: Mon, 24 Jul 2017 09:27:21 -0700 Subject: [PATCH 03/21] Set `allow_defun` to False, allowing export to tf.SavedModel PiperOrigin-RevId: 162946551 --- tensor2tensor/models/common_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index 11b6396a8..37e791bc3 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -31,7 +31,7 @@ from tensorflow.python.framework import function # This is a global setting. When turned off, no @function.Defun is used. -allow_defun = True +allow_defun = False def saturating_sigmoid(x): From c422b989ba9963b2900b53aa5d8de8d5505ddc01 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Mon, 24 Jul 2017 09:27:44 -0700 Subject: [PATCH 04/21] Add task_id to Problem for possibly distributed data gen PiperOrigin-RevId: 162946584 --- tensor2tensor/bin/t2t-datagen | 5 ++++- tensor2tensor/data_generators/problem.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 57e2b17fb..ecb5175e6 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -67,6 +67,7 @@ flags.DEFINE_integer("num_shards", 10, "How many shards to use.") flags.DEFINE_integer("max_cases", 0, "Maximum number of cases to generate (unbounded if 0).") flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") +flags.DEFINE_integer("task_id", -1, "For distributed data generation.") flags.DEFINE_string("t2t_usr_dir", "", "Path to a Python module that will be imported. The " "__init__.py file should include the necessary imports. 
" @@ -277,9 +278,11 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): problem = registry.problem(problem_name) + task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), os.path.expanduser(FLAGS.tmp_dir), - FLAGS.num_shards) + FLAGS.num_shards, + task_id=task_id) if __name__ == "__main__": diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 690f14277..99f8e97de 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -115,7 +115,7 @@ class Problem(object): # BEGIN SUBCLASS INTERFACE # ============================================================================ - def generate_data(self, data_dir, tmp_dir, num_shards=None): + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): raise NotImplementedError() def hparams(self, defaults, model_hparams): From 7a3c35dabaedbca620e5b7915903076ae93e03a7 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Mon, 24 Jul 2017 18:03:03 -0700 Subject: [PATCH 05/21] GeneExpression Problem, RealModality, and Problem.preprocessing PiperOrigin-RevId: 163016460 --- tensor2tensor/bin/t2t-datagen | 2 +- tensor2tensor/data_generators/algorithmic.py | 2 +- tensor2tensor/data_generators/genetics.py | 171 +++++++++++++++--- .../data_generators/genetics_test.py | 19 +- tensor2tensor/data_generators/image.py | 32 +++- tensor2tensor/data_generators/problem.py | 27 +++ tensor2tensor/data_generators/wmt.py | 2 +- tensor2tensor/models/common_hparams.py | 2 + tensor2tensor/models/modalities.py | 38 +++- tensor2tensor/models/transformer.py | 4 +- tensor2tensor/utils/data_reader.py | 76 ++++++-- tensor2tensor/utils/modality.py | 2 +- tensor2tensor/utils/trainer_utils.py | 72 +++++--- 13 files changed, 363 insertions(+), 86 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index ecb5175e6..783906d95 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -281,7 +281,7 @@ def generate_data_for_registered_problem(problem_name): task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), os.path.expanduser(FLAGS.tmp_dir), - FLAGS.num_shards, + num_shards=FLAGS.num_shards, task_id=task_id) diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py index 676b4e45f..017bc8470 100644 --- a/tensor2tensor/data_generators/algorithmic.py +++ b/tensor2tensor/data_generators/algorithmic.py @@ -66,7 +66,7 @@ def dev_size(self): def num_shards(self): return 10 - def generate_data(self, data_dir, _, num_shards=None): + def generate_data(self, data_dir, _, num_shards=None, task_id=-1): if num_shards is None: num_shards = self.num_shards diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index b4ad36544..848c2341b 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -35,6 +35,7 @@ from __future__ import division from __future__ import print_function +import itertools import multiprocessing as mp import os @@ -50,19 +51,13 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.utils import registry -_bases = list("ACTG") -BASE_TO_ID = dict(zip(_bases, range(len(_bases)))) -ID_TO_BASE = dict(zip(range(len(_bases)), _bases)) -UNK_ID = len(_bases) - +import tensorflow as tf -# TODO(rsepassi): -# * 
DataEncoder for genetic bases -# * GeneticModality and problem hparams -# * Training preprocessing +_bases = list("ACTG") -class GeneticsProblem(problem.Problem): +class GeneExpressionProblem(problem.Problem): + """Base Problem for gene expression datasets.""" @property def download_url(self): @@ -72,13 +67,35 @@ def download_url(self): def h5_file(self): raise NotImplementedError() - def generate_data(self, data_dir, tmp_dir, num_shards=None): + @property + def num_output_predictions(self): + """Number of float predictions per timestep.""" + return 10 + + @property + def chunk_size(self): + return 4 + + def feature_encoders(self, data_dir): + del data_dir + return { + "inputs": GeneticBaseEncoder(chunk_size=self.chunk_size), + # TODO(rsepassi): RealEncoder? + "targets": text_encoder.TextEncoder() + } + + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): if num_shards is None: num_shards = 100 - # Download source data - h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, - self.download_url) + try: + # Download source data if download_url specified + h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, + self.download_url) + except NotImplementedError: + # Otherwise, look for it locally + h5_filepath = os.path.join(tmp_dir, self.h5_file) + with h5py.File(h5_filepath, "r") as h5_file: num_train_examples = h5_file["train_in"].len() num_dev_examples = h5_file["valid_in"].len() @@ -100,7 +117,8 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None): outfiles, num_examples): p = mp.Process( target=generate_dataset, - args=(h5_filepath, key_prefix, [outfile], start_idx, end_idx)) + args=(h5_filepath, key_prefix, [outfile], self.chunk_size, + start_idx, end_idx)) processes.append(p) # Start and wait for processes @@ -113,9 +131,36 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None): # Shuffle generator_utils.shuffle_dataset(all_filepaths) + def hparams(self, defaults, model_hparams): + p = defaults + vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)} + p.target_modality = ("%s:real" % registry.Modalities.GENERIC, + self.num_output_predictions) + p.input_space_id = problem.SpaceID.DNA + p.target_space_id = problem.SpaceID.REAL + + def example_reading_spec(self): + # TODO(rsepassi): propagate and apply targets_mask to output RealModality. 
+ data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets_mask": tf.VarLenFeature(tf.float32), + "targets": tf.VarLenFeature(tf.float32), + } + data_items_to_decoders = None + return (data_fields, data_items_to_decoders) + + def preprocess_examples(self, examples, mode): + del mode + + examples["targets"] = tf.reshape(examples["targets"], + [-1, 1, self.num_output_predictions]) + + return examples + @registry.register_problem("genetics_cage10") -class GeneticsCAGE10(GeneticsProblem): +class GeneticsCAGE10(GeneExpressionProblem): @property def download_url(self): @@ -127,7 +172,7 @@ def h5_file(self): @registry.register_problem("genetics_gm12878") -class GeneticsGM12878(GeneticsProblem): +class GeneticsGM12878(GeneExpressionProblem): @property def download_url(self): @@ -138,6 +183,14 @@ def h5_file(self): return "gm12878.h5" +@registry.register_problem("genetics_l262k") +class GeneticsL262k(GeneExpressionProblem): + + @property + def h5_file(self): + return "l262k_w128.h5" + + def generate_shard_args(outfiles, num_examples): """Generate start and end indices per outfile.""" num_shards = len(outfiles) @@ -152,16 +205,22 @@ def generate_shard_args(outfiles, num_examples): def generate_dataset(h5_filepath, key_prefix, out_filepaths, + chunk_size=1, start_idx=None, end_idx=None): print("PID: %d, Key: %s, (Start, End): (%s, %s)" % (os.getpid(), key_prefix, start_idx, end_idx)) generator_utils.generate_files( - dataset_generator(h5_filepath, key_prefix, start_idx, end_idx), - out_filepaths) + dataset_generator(h5_filepath, key_prefix, chunk_size, start_idx, + end_idx), out_filepaths) -def dataset_generator(filepath, dataset, start_idx=None, end_idx=None): +def dataset_generator(filepath, + dataset, + chunk_size=1, + start_idx=None, + end_idx=None): + encoder = GeneticBaseEncoder(chunk_size=chunk_size) with h5py.File(filepath, "r") as h5_file: # Get input keys from h5_file src_keys = [s % dataset for s in ["%s_in", "%s_na", "%s_out"]] @@ -178,12 +237,13 @@ def dataset_generator(filepath, dataset, start_idx=None, end_idx=None): if i % 100 == 0: print("Generating example %d for %s" % (i, dataset)) inputs, mask, outputs = inp_data[i], mask_data[i], out_data[i] - yield to_example_dict(inputs, mask, outputs) + yield to_example_dict(encoder, inputs, mask, outputs) -def to_example_dict(inputs, mask, outputs): +def to_example_dict(encoder, inputs, mask, outputs): """Convert single h5 record to an example dict.""" # Inputs + bases = [] input_ids = [] last_idx = -1 for row in np.argwhere(inputs): @@ -192,11 +252,13 @@ def to_example_dict(inputs, mask, outputs): assert idx > last_idx # if not, means 2 True values in 1 row # Some rows are all False. Those rows are mapped to UNK_ID. while idx != last_idx + 1: - input_ids.append(UNK_ID + text_encoder.NUM_RESERVED_TOKENS) + bases.append(encoder.UNK) last_idx += 1 - input_ids.append(base_id + text_encoder.NUM_RESERVED_TOKENS) + bases.append(_bases[base_id]) last_idx = idx - assert len(inputs) == len(input_ids) + assert len(inputs) == len(bases) + + input_ids = encoder.encode(bases) input_ids.append(text_encoder.EOS_ID) # Targets: mask and output @@ -211,3 +273,62 @@ def to_example_dict(inputs, mask, outputs): ex_dict = dict( zip(example_keys, [input_ids, targets_mask, targets, targets_shape])) return ex_dict + + +class GeneticBaseEncoder(text_encoder.TextEncoder): + """ACTG strings to ints and back. Optionally chunks bases into single ids. + + Uses 'X' as an unknown base. 
+ """ + UNK = "X" + PAD = "0" + + def __init__(self, + chunk_size=1, + num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS): + super(GeneticBaseEncoder, self).__init__(num_reserved_ids=num_reserved_ids) + # Build a vocabulary of chunks of size chunk_size + self._chunk_size = chunk_size + chunks = [] + for size in range(1, chunk_size + 1): + c = itertools.product(_bases + [GeneticBaseEncoder.UNK], repeat=size) + num_pad = chunk_size - size + padding = (GeneticBaseEncoder.PAD,) * num_pad + c = [el + padding for el in c] + chunks.extend(c) + chunks.sort() + ids = range(self._num_reserved_ids, len(chunks) + self._num_reserved_ids) + self._ids_to_chunk = dict(zip(ids, chunks)) + self._chunks_to_ids = dict(zip(chunks, ids)) + + @property + def vocab_size(self): + return len(self._ids_to_chunk) + self._num_reserved_ids + + def encode(self, s): + bases = list(s) + pad = [GeneticBaseEncoder.PAD] * (len(bases) % self._chunk_size) + bases.extend(pad) + assert (len(bases) % self._chunk_size) == 0 + num_chunks = len(bases) // self._chunk_size + ids = [] + for chunk_idx in xrange(num_chunks): + start_idx = chunk_idx * self._chunk_size + end_idx = start_idx + self._chunk_size + chunk = tuple(bases[start_idx:end_idx]) + if chunk not in self._chunks_to_ids: + raise ValueError("Unrecognized chunk %s" % chunk) + ids.append(self._chunks_to_ids[chunk]) + return ids + + def decode(self, ids): + bases = [] + for idx in ids: + if idx >= self._num_reserved_ids: + chunk = self._ids_to_chunk[idx] + if GeneticBaseEncoder.PAD in chunk: + chunk = chunk[:chunk.index(GeneticBaseEncoder.PAD)] + else: + chunk = [text_encoder.RESERVED_TOKENS[idx]] + bases.extend(chunk) + return "".join(bases) diff --git a/tensor2tensor/data_generators/genetics_test.py b/tensor2tensor/data_generators/genetics_test.py index 85d70f934..5eac1b249 100644 --- a/tensor2tensor/data_generators/genetics_test.py +++ b/tensor2tensor/data_generators/genetics_test.py @@ -30,21 +30,28 @@ class GeneticsTest(tf.test.TestCase): def _oneHotBases(self, bases): + ref = ["A", "C", "T", "G"] one_hots = [] - for base_id in bases: + for base in bases: one_hot = [False] * 4 - if base_id < 4: - one_hot[base_id] = True + if base in ref: + one_hot[ref.index(base)] = True one_hots.append(one_hot) return np.array(one_hots) def testRecordToExample(self): - inputs = self._oneHotBases([0, 1, 3, 4, 1, 0]) + encoder = genetics.GeneticBaseEncoder(chunk_size=2) + raw_inputs = ["A", "C", "G", "X", "C", "T"] + + # Put in numpy arrays in the same format as in the h5 file + inputs = self._oneHotBases(raw_inputs) mask = np.array([True, False, True]) outputs = np.array([[1.0, 2.0, 3.0], [5.0, 1.0, 0.2], [5.1, 2.3, 2.3]]) - ex_dict = genetics.to_example_dict(inputs, mask, outputs) + # Convert to example dict + ex_dict = genetics.to_example_dict(encoder, inputs, mask, outputs) - self.assertAllEqual([2, 3, 5, 6, 3, 2, 1], ex_dict["inputs"]) + self.assertEqual(len(raw_inputs) // 2 + 1, len(ex_dict["inputs"])) + self.assertAllEqual(encoder.encode(raw_inputs) + [1], ex_dict["inputs"]) self.assertAllEqual([1.0, 0.0, 1.0], ex_dict["targets_mask"]) self.assertAllEqual([1.0, 2.0, 3.0, 5.0, 1.0, 0.2, 5.1, 2.3, 2.3], ex_dict["targets"]) diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index f8e3191a2..acb1128ed 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -307,14 +307,38 @@ def mscoco_generator(data_dir, "image/width": [width] } + +class ImageProblem(problem.Problem): + + def 
example_reading_spec(self, label_key=None): + if label_key is None: + label_key = "image/class/label" + + data_fields = { + "image/encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), + label_key: tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = { + "inputs": + tf.contrib.slim.tfexample_decoder.Image( + image_key="image/encoded", + format_key="image/format", + channels=3), + "targets": + tf.contrib.slim.tfexample_decoder.Tensor(label_key), + } + + return data_fields, data_items_to_decoders + # French street names dataset. @registry.register_problem -class ImageFSNS(problem.Problem): +class ImageFSNS(ImageProblem): """Problem spec for French Street Name recognition.""" - def generate_data(self, data_dir, tmp_dir): + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" "street/python/fsns_urls.txt") fsns_urls = generator_utils.maybe_download( @@ -351,6 +375,10 @@ def hparams(self, defaults, model_hparams): p.input_space_id = problem.SpaceID.DIGIT_0 p.target_space_id = problem.SpaceID.DIGIT_1 + def example_reading_spec(self): + label_key = "image/unpadded_label" + return super(ImageFSNS, self).example_reading_spec(self, + label_key=label_key) # Filename for CELEBA data. _CELEBA_NAME = "img_align_celeba" diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 99f8e97de..02e198c03 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -70,6 +70,10 @@ class SpaceID(object): ICE_PARSE_TOK = 19 # Macedonian tokens MK_TOK = 20 + # Genetic bases (ACTG) + DNA = 21 + # Real numbers + REAL = 22 class Problem(object): @@ -131,6 +135,18 @@ def feature_encoders(self, data_dir): "targets": text_encoder.TextEncoder() } + def example_reading_spec(self): + data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = None + return (data_fields, data_items_to_decoders) + + def preprocess_examples(self, examples, mode): + del mode + return examples + # ============================================================================ # END SUBCLASS INTERFACE # ============================================================================ @@ -193,6 +209,17 @@ def internal_hparams(self, model_hparams): _copy_problem_hparams(hp) return hp + def maybe_reverse_features(self, feature_map): + if not self._was_reversed: + return + inputs, targets = feature_map["inputs"], feature_map["targets"] + feature_map["inputs"], feature_map["targets"] = targets, inputs + + def maybe_copy_features(self, feature_map): + if not self._was_copy: + return + feature_map["targets"] = feature_map["inputs"] + def _copy_problem_hparams(p_hparams): """Use input modality, vocab, and space id for target.""" diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index bb31d0c0f..3fc74473a 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -92,7 +92,7 @@ def target_space_id(self): def num_shards(self): return 100 - def generate_data(self, data_dir, tmp_dir, num_shards=None): + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): if num_shards is None: num_shards = self.num_shards if self.is_character_level: diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index a86974d1f..e36b2e4e1 100644 --- 
a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -50,6 +50,8 @@ def basic_params1(): # when not in training mode. dropout=0.2, clip_grad_norm=2.0, + grad_noise_scale=0.0, + summarize_grads=int(False), initializer="orthogonal", initializer_gain=1.5, label_smoothing=0.1, diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 9a6115558..50a3da55d 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -181,12 +181,11 @@ def top(self, body_output, _): shape = tf.shape(body_output_split[i])[:-1] body_output = tf.reshape(body_output_split[i], [-1, self._body_input_depth]) - channel_logits = tf.matmul(body_output, - output_rgb_embedding_var[i], - transpose_b=True) - rgb_channel_logits.append(tf.reshape( - channel_logits, tf.concat([shape, [self.top_dimensionality]], - 0))) + channel_logits = tf.matmul( + body_output, output_rgb_embedding_var[i], transpose_b=True) + rgb_channel_logits.append( + tf.reshape(channel_logits, + tf.concat([shape, [self.top_dimensionality]], 0))) logits = tf.concat(rgb_channel_logits, axis=3) # Reshape logits to conform to CIFAR image shapes (32 by 32 by 3) @@ -468,6 +467,33 @@ def top(self, body_output, _): return body_output +@registry.register_generic_modality("real") +class RealModality(modality.Modality): + """Modality for real (i.e. float) vectors.""" + + def bottom(self, x): + with tf.variable_scope("real"): + return tf.layers.dense(x, self._body_input_depth) + + def top(self, body_output, _): + with tf.variable_scope("real"): + return tf.layers.dense(body_output, self._vocab_size) + + def top_sharded(self, + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=common_layers.weights_nonzero): + sharded_predictions = data_parallelism(self.top, sharded_body_output, + sharded_targets) + + def l2_loss(predictions, targets): + return tf.reduce_mean(tf.pow(predictions - targets, 2)) + + loss = data_parallelism(l2_loss, sharded_predictions, sharded_targets) + return sharded_predictions, tf.add_n(loss) + + @registry.register_image_modality("identity_no_pad") class IdentityModalityNoPad(modality.Modality): """Does nothing except making sure that there is no padding in cross-ent.""" diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 23197fcd9..c45e88577 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -46,8 +46,8 @@ def model_fn_body(self, features): # Remove dropout if not training hparams = copy.copy(self._hparams) targets = features["targets"] - inputs = features.get("inputs") - target_space = features.get("target_space_id") + inputs = features["inputs"] + target_space = features["target_space_id"] inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index d7af960ab..24dd31485 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -138,10 +138,12 @@ def preprocessing(examples, data_file_pattern, mode): # Small single-example pre-processing for images. def resize(img, size): return tf.to_int64(tf.image.resize_images(img, [size, size])) + def preprocess(img): img = tf.image.resize_images(img, [360, 360]) img = common_layers.image_augmentation(tf.to_float(img) / 255.) return tf.to_int64(img * 255.) 
+ if ("image_imagenet" in data_file_pattern or "image_mscoco" in data_file_pattern): examples["inputs"] = tf.cast(examples["inputs"], tf.int64) @@ -154,8 +156,8 @@ def preprocess(img): lambda img=inputs: resize(img, 299)) else: examples["inputs"] = tf.to_int64(resize(inputs, 299)) - elif ("image_cifar10" in data_file_pattern - and mode == tf.contrib.learn.ModeKeys.TRAIN): + elif ("image_cifar10" in data_file_pattern and + mode == tf.contrib.learn.ModeKeys.TRAIN): examples["inputs"] = common_layers.cifar_image_augmentation( examples["inputs"]) elif "img2img" in data_file_pattern: @@ -182,8 +184,62 @@ def preprocess(img): return examples -def input_pipeline(data_file_pattern, capacity, mode): +def problem_input_pipeline(problem, data_file_pattern, capacity, mode): + """Input pipeline for Problems.""" + data_fields, data_items_to_decoders = problem.example_reading_spec() + + # Create placeholders for input, rather than reading data from disk. + if data_file_pattern is None: + return feature_placeholders(data_fields) + + # Now the non-trivial case construction. + examples = examples_queue( + [data_file_pattern], + data_fields, + training=(mode == tf.contrib.learn.ModeKeys.TRAIN), + capacity=capacity, + data_items_to_decoders=data_items_to_decoders) + + examples = problem.preprocess_examples(examples, mode) + + # We do not want int64s as they are not supported on GPUs. + examples = cast_int64_to_int32(examples) + + return examples + + +def cast_int64_to_int32(features): + f = {} + for k, v in six.iteritems(features): + if v.dtype == tf.int64: + v = tf.to_int32(v) + f[k] = v + return f + + +def feature_placeholders(data_fields): + feature_map = {} + for (field, tp) in data_fields: + if not field.startswith("targets"): + feature_map[field] = tf.placeholder( + dtype=tp, shape=[None] * 4, name=field) + return feature_map + + +def input_pipeline(problem, data_file_pattern, capacity, mode): """Input pipeline, returns a dictionary of tensors from queues.""" + + if problem is not None: + # problem is not None when the problem is specified with the Problem API, + # which handles Example decoding and preprocessing. + # Otherwise the problem is specified in problem_hparams and is dealt with + # below. + # As problems are ported to the Problem API, the special handling here will + # need to be moved to Problem.example_reading_spec and + # Problem.preprocessing. + return problem_input_pipeline(problem, data_file_pattern, capacity, mode) + + data_items_to_decoders = None # Read from image TFRecords if the file has "image" in its name. if data_file_pattern and "image" in data_file_pattern: label_key = "image/class/label" @@ -211,22 +267,15 @@ def input_pipeline(data_file_pattern, capacity, mode): "audio/sample_width": tf.FixedLenFeature((), tf.int64), "targets": tf.VarLenFeature(tf.int64), } - data_items_to_decoders = None else: data_fields = { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) } - data_items_to_decoders = None # Create placeholders for input, rather than reading data from disk. if data_file_pattern is None: - feature_map = {} - for (field, tp) in data_fields: - if field != "targets": - feature_map[field] = tf.placeholder( - dtype=tp, shape=[None] * 4, name=field) - return feature_map + return feature_placeholders(data_fields) # Now the non-trivial case construction. 
examples = examples_queue( @@ -238,8 +287,9 @@ def input_pipeline(data_file_pattern, capacity, mode): examples = preprocessing(examples, data_file_pattern, mode) - # We do not want int64s as they do are not supported on GPUs. - return {k: tf.to_int32(v) for (k, v) in six.iteritems(examples)} + # We do not want int64s as they are not supported on GPUs. + examples = cast_int64_to_int32(examples) + return examples def batch_examples(examples, batching_scheme): diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index a42f35c24..72169be1f 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -43,7 +43,7 @@ class Modality(object): function targets_bottom represents the auto-regressive part of the network. It is applied to the already-generated part of an image, which is given to the decoder to generate the next part. In some cases, e.g., for text, it is - the same as the inputs_bottom function, as that is the default we use. But, + the same as the inputs_bottom function, and that is the default we use. But, e.g., for images, a different function might be needed to regress properly. All 3 functions have simple and sharded versions. A sub-class only needs diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 96c43a5a0..c4bdcf942 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -229,13 +229,16 @@ def create_hparams(params_id, data_dir): # Add hparams for the problems hparams.problems = [] + hparams.problem_instances = [] for problem_name in FLAGS.problems.split("-"): try: problem = registry.problem(problem_name) p_hparams = problem.internal_hparams(hparams) except ValueError: + problem = None p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + hparams.problem_instances.append(problem) hparams.problems.append(p_hparams) return hparams @@ -304,9 +307,10 @@ def session_config(): gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction) - config = tf.ConfigProto(allow_soft_placement=True, - graph_options=graph_options, - gpu_options=gpu_options) + config = tf.ConfigProto( + allow_soft_placement=True, + graph_options=graph_options, + gpu_options=gpu_options) return config @@ -422,8 +426,12 @@ def model_fn(features, targets, mode): def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( - hparams, mode, hparams.problems[n], - n, dp, _ps_devices(all_workers=True)) + hparams, + mode, + hparams.problems[n], + n, + dp, + _ps_devices(all_workers=True)) if mode == tf.contrib.learn.ModeKeys.INFER: return model_class.infer( features, @@ -485,8 +493,8 @@ def nth_model(n): if mode == tf.contrib.learn.ModeKeys.EVAL: logits = tf.concat(sharded_logits, 0) if FLAGS.eval_print: - logits = tf.Print(logits, [features["inputs"], logits], - "EVAL PRINT", summarize=10000) + logits = tf.Print( + logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) # For evaluation, return the logits layer as our predictions. run_info["predictions"] = logits train_op = None @@ -544,19 +552,24 @@ def nth_model(n): # Define the train_op for the TRAIN mode. 
opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams) tf.logging.info("Computing gradients for global model_fn.") + opt_summaries = ["learning_rate", "loss", "global_gradient_norm"] + if hparams.summarize_grads: + opt_summaries.extend(["gradients", "gradient_norm"]) train_op = tf.contrib.layers.optimize_loss( name="training", loss=total_loss, global_step=tf.contrib.framework.get_global_step(), learning_rate=learning_rate, clip_gradients=hparams.clip_grad_norm or None, + gradient_noise_scale=hparams.grad_noise_scale or None, optimizer=opt, + summaries=opt_summaries, colocate_gradients_with_ops=True) # Remove summaries that will fail to run because they are in conditionals. # TODO(cwhipkey): Test with this code removed, later in 2017. summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) - for i in range(len(summaries)-1, -1, -1): + for i in range(len(summaries) - 1, -1, -1): if summaries[i].name.startswith("cond_"): del summaries[i] @@ -602,8 +615,7 @@ def decode_from_dataset(estimator): data_file_patterns=infer_problems_data, num_datashards=data_parallelism().n, fixed_problem=i) - result_iter = estimator.predict( - input_fn=infer_input_fn, as_iterable=False) + result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) def log_fn(inputs, targets, @@ -735,8 +747,8 @@ def decode_interactively(estimator): else: tf.logging.info(beam_string) else: - tf.logging.info(targets_vocab.decode(_save_until_eos( - result["outputs"].flatten()))) + tf.logging.info( + targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, @@ -749,8 +761,8 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, tf.logging.info("Decoding batch %d" % b) batch_length = 0 batch_inputs = [] - for inputs in sorted_inputs[b * FLAGS.decode_batch_size: - (b + 1) * FLAGS.decode_batch_size]: + for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( + b + 1) * FLAGS.decode_batch_size]: input_ids = vocabulary.encode(inputs) if FLAGS.decode_max_input_size > 0: # Subtract 1 for the EOS_ID. @@ -1048,12 +1060,13 @@ def input_fn(): for n in xrange(problem_count): if fixed_problem is not None and n != fixed_problem: continue + problem_instance = hparams.problem_instances[n] with tf.name_scope("problem_%d" % n): with tf.device("/cpu:0"): # Input queues are on CPU. capacity = hparams.problems[n].max_expected_batch_size_per_shard capacity *= num_datashards - examples = data_reader.input_pipeline(data_file_patterns[n], - capacity, mode) + examples = data_reader.input_pipeline( + problem_instance, data_file_patterns[n], capacity, mode) if mode == tf.contrib.learn.ModeKeys.TRAIN: drop_long_sequences = True else: @@ -1068,15 +1081,18 @@ def input_fn(): length_multiplier=batch_size_multiplier)) # Reverse inputs and targets features if the problem was reversed. - if hparams.problems[n].was_reversed: - inputs = feature_map["inputs"] - targets = feature_map["targets"] - feature_map["inputs"] = targets - feature_map["targets"] = inputs - - # Use the inputs as the targets if the problem is a copy problem. 
- if hparams.problems[n].was_copy: - feature_map["targets"] = feature_map["inputs"] + if problem_instance is not None: + problem_instance.maybe_reverse_features(feature_map) + problem_instance.maybe_copy_features(feature_map) + else: + if hparams.problems[n].was_reversed: + inputs = feature_map["inputs"] + targets = feature_map["targets"] + feature_map["inputs"] = targets + feature_map["targets"] = inputs + # Use the inputs as the targets if the problem is a copy problem. + if hparams.problems[n].was_copy: + feature_map["targets"] = feature_map["inputs"] # Ensure inputs and targets are proper rank. while len(feature_map["inputs"].get_shape()) != 4: @@ -1117,9 +1133,9 @@ def input_fn(): assert FLAGS.worker_replicas % problem_count == 0 problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) else: - raise ValueError("Value of hparams.problem_choice is %s and must be " - "one of [uniform, adaptive, distributed]" % - hparams.problem_choice) + raise ValueError( + "Value of hparams.problem_choice is %s and must be " + "one of [uniform, adaptive, distributed]" % hparams.problem_choice) # Inputs and targets conditional on problem_choice. rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index( From 315647cdbf6efc78591f3047627ca064c75c31dc Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Mon, 24 Jul 2017 18:51:17 -0700 Subject: [PATCH 06/21] Download newer WMT dev set. PiperOrigin-RevId: 163020223 --- tensor2tensor/data_generators/problem.py | 8 +- .../data_generators/problem_hparams.py | 3 + tensor2tensor/data_generators/wmt.py | 316 ++++++++++-------- tensor2tensor/models/common_attention.py | 3 + tensor2tensor/utils/get_ende_bleu.sh | 4 +- tensor2tensor/utils/trainer_utils.py | 3 +- 6 files changed, 186 insertions(+), 151 deletions(-) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 02e198c03..22b6214e6 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -70,10 +70,14 @@ class SpaceID(object): ICE_PARSE_TOK = 19 # Macedonian tokens MK_TOK = 20 + # Czech tokens + CS_TOK = 21 + # Czech characters + CS_CHR = 22 # Genetic bases (ACTG) - DNA = 21 + DNA = 23 # Real numbers - REAL = 22 + REAL = 24 class Problem(object): diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 4343afd27..159ea6ac9 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -181,6 +181,9 @@ def default_problem_hparams(): # 17: Icelandic characters # 18: Icelandic tokens # 19: Icelandic parse tokens + # 20: Macedonian tokens + # 21: Czech tokens + # 22: Czech characters # Add more above if needed. 
input_space_id=0, target_space_id=0, diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 3fc74473a..50125ccd1 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -43,23 +43,6 @@ EOS = text_encoder.EOS_ID -def _default_token_feature_encoders(data_dir, target_vocab_size): - vocab_filename = os.path.join(data_dir, - "vocab.endefr.%d" % target_vocab_size) - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - return { - "inputs": subtokenizer, - "targets": subtokenizer, - } - - -def _default_character_feature_encoders(): - return { - "inputs": text_encoder.ByteTextEncoder(), - "targets": text_encoder.ByteTextEncoder(), - } - - class WMTProblem(problem.Problem): """Base class for WMT problems.""" @@ -71,14 +54,13 @@ def is_character_level(self): def targeted_vocab_size(self): raise NotImplementedError() # Not needed if self.is_character_level. - @property - def train_generator(self): - """Generator; takes data_dir, tmp_dir, is_training, targeted_vocab_size.""" + def train_generator(self, data_dir, tmp_dir, is_training): + """Generator of the training data.""" raise NotImplementedError() - @property - def dev_generator(self): - return self.train_generator + def dev_generator(self, data_dir, tmp_dir): + """Generator of the development data.""" + return self.train_generator(data_dir, tmp_dir, False) @property def input_space_id(self): @@ -92,28 +74,35 @@ def target_space_id(self): def num_shards(self): return 100 + @property + def vocab_name(self): + return "vocab.endefr" + + @property + def vocab_file(self): + return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): if num_shards is None: num_shards = self.num_shards - if self.is_character_level: - generator_utils.generate_dataset_and_shuffle( - self.train_generator(tmp_dir, True), - self.training_filepaths(data_dir, num_shards, shuffled=False), - self.dev_generator(tmp_dir, False), - self.dev_filepaths(data_dir, 1, shuffled=False)) - else: - generator_utils.generate_dataset_and_shuffle( - self.train_generator(data_dir, tmp_dir, True, - self.targeted_vocab_size), - self.training_filepaths(data_dir, num_shards, shuffled=False), - self.dev_generator(data_dir, tmp_dir, False, - self.targeted_vocab_size), - self.dev_filepaths(data_dir, 1, shuffled=False)) + generator_utils.generate_dataset_and_shuffle( + self.train_generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, num_shards, shuffled=False), + self.dev_generator(data_dir, tmp_dir), + self.dev_filepaths(data_dir, 1, shuffled=False)) def feature_encoders(self, data_dir): if self.is_character_level: - return _default_character_feature_encoders() - return _default_token_feature_encoders(data_dir, self.targeted_vocab_size) + return { + "inputs": text_encoder.ByteTextEncoder(), + "targets": text_encoder.ByteTextEncoder(), + } + vocab_filename = os.path.join(data_dir, self.vocab_file) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + return { + "inputs": subtokenizer, + "targets": subtokenizer, + } def hparams(self, defaults, unused_model_hparams): p = defaults @@ -175,8 +164,8 @@ def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): Args: source_path: path to the file with source and target sentences. - source_vocab: a SunwordTextEncoder to encode the source string. - target_vocab: a SunwordTextEncoder to encode the target string. 
+ source_vocab: a SubwordTextEncoder to encode the source string. + target_vocab: a SubwordTextEncoder to encode the target string. eos: integer to append at the end of each sequence (default: None). Yields: @@ -262,7 +251,7 @@ def bi_vocabs_token_generator(source_path, _ENDE_TRAIN_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long ("training-parallel-nc-v11/news-commentary-v11.de-en.en", "training-parallel-nc-v11/news-commentary-v11.de-en.de") ], @@ -277,7 +266,7 @@ def bi_vocabs_token_generator(source_path, ] _ENDE_TEST_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/dev.tgz", + "http://data.statmt.org/wmt17/translation-task/dev.tgz", ("dev/newstest2013.en", "dev/newstest2013.de") ], ] @@ -307,7 +296,7 @@ def bi_vocabs_token_generator(source_path, ] _ENFR_TEST_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/dev.tgz", + "http://data.statmt.org/wmt17/translation-task/dev.tgz", ("dev/newstest2013.en", "dev/newstest2013.fr") ], ] @@ -337,6 +326,29 @@ def bi_vocabs_token_generator(source_path, ("dev.mk", "dev.en") ]] +# English-Czech datasets +_ENCS_TRAIN_DATASETS = [ + [ + "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + ("training-parallel-nc-v11/news-commentary-v11.cs-en.en", + "training-parallel-nc-v11/news-commentary-v11.cs-en.cs") + ], + [ + "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", + ("commoncrawl.cs-en.en", "commoncrawl.cs-en.cs") + ], + [ + "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", + ("training/europarl-v7.cs-en.en", "training/europarl-v7.cs-en.cs") + ], +] +_ENCS_TEST_DATASETS = [ + [ + "http://data.statmt.org/wmt17/translation-task/dev.tgz", + ("dev/newstest2013.en", "dev/newstest2013.cs") + ], +] + # Generators. 
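Each *_TRAIN_DATASETS / *_TEST_DATASETS entry above is a pair of [tarball_url, (source_filename, target_filename)]; the refactored train_generator methods in the hunks below split such a list per language before building the vocabularies. A small sketch of that split, using made-up URL and file names:

datasets = [["http://example.com/corpus.tgz", ("corpus.en", "corpus.cs")]]  # hypothetical entry
source_datasets = [[item[0], [item[1][0]]] for item in datasets]
target_datasets = [[item[0], [item[1][1]]] for item in datasets]
assert source_datasets == [["http://example.com/corpus.tgz", ["corpus.en"]]]
assert target_datasets == [["http://example.com/corpus.tgz", ["corpus.cs"]]]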
@@ -408,16 +420,6 @@ def _compile_data(tmp_dir, datasets, filename): return filename -def ende_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @registry.register_problem("wmt_ende_tokens_8k") class WMTEnDeTokens8k(WMTProblem): """Problem spec for WMT En-De translation.""" @@ -426,9 +428,14 @@ class WMTEnDeTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - @property - def train_generator(self): - return ende_wordpiece_token_generator + def train_generator(self, data_dir, tmp_dir, train): + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) + datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -447,15 +454,6 @@ def targeted_vocab_size(self): return 2**15 # 32768 -def ende_character_generator(tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) - - @registry.register_problem("wmt_ende_characters") class WMTEnDeCharacters(WMTProblem): """Problem spec for WMT En-De translation.""" @@ -464,9 +462,13 @@ class WMTEnDeCharacters(WMTProblem): def is_character_level(self): return True - @property - def train_generator(self): - return ende_character_generator + def train_generator(self, tmp_dir, train): + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) @property def input_space_id(self): @@ -477,29 +479,6 @@ def target_space_id(self): return problem.SpaceID.DE_CHR -def zhen_wordpiece_token_bigenerator(data_dir, tmp_dir, train, - source_vocab_size, target_vocab_size): - """Wordpiece generator for the WMT'17 zh-en dataset.""" - datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS] - target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS] - source_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, - source_vocab_size, source_datasets) - target_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.en.%d" % target_vocab_size, - target_vocab_size, target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) - return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", - source_vocab, target_vocab, EOS) - - -def 
zhen_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): - return zhen_wordpiece_token_bigenerator(data_dir, tmp_dir, train, - vocab_size, vocab_size) - - @registry.register_problem("wmt_zhen_tokens_8k") class WMTZhEnTokens8k(WMTProblem): """Problem spec for WMT Zh-En translation.""" @@ -508,9 +487,22 @@ class WMTZhEnTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - @property - def train_generator(self): - return zhen_wordpiece_token_generator + def train_generator(self, data_dir, tmp_dir, train): + source_vocab_size = self.targeted_vocab_size + target_vocab_size = self.targeted_vocab_size + datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in datasets] + target_datasets = [[item[0], [item[1][1]]] for item in datasets] + source_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, source_vocab_size, + source_datasets) + target_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.en.%d" % target_vocab_size, target_vocab_size, + target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) + return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", + source_vocab, target_vocab, EOS) @property def input_space_id(self): @@ -542,17 +534,6 @@ def targeted_vocab_size(self): return 2**15 # 32768 -def enfr_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): - """Instance of token generator for the WMT en->fr task.""" - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) - datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @registry.register_problem("wmt_enfr_tokens_8k") class WMTEnFrTokens8k(WMTProblem): """Problem spec for WMT En-Fr translation.""" @@ -561,9 +542,14 @@ class WMTEnFrTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - @property - def train_generator(self): - return enfr_wordpiece_token_generator + def train_generator(self, data_dir, tmp_dir, train): + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) + datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -582,16 +568,6 @@ def targeted_vocab_size(self): return 2**15 # 32768 -def enfr_character_generator(tmp_dir, train): - """Instance of character generator for the WMT en->fr task.""" - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) - - @registry.register_problem("wmt_enfr_characters") class WMTEnFrCharacters(WMTProblem): """Problem spec for WMT En-Fr translation.""" @@ -600,9 +576,13 @@ class WMTEnFrCharacters(WMTProblem): def is_character_level(self): return True - 
@property - def train_generator(self): - return enfr_character_generator + def train_generator(self, data_dir, tmp_dir, train): + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) @property def input_space_id(self): @@ -613,20 +593,6 @@ def target_space_id(self): return problem.SpaceID.FR_CHR -def mken_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): - """Wordpiece generator for the SETimes Mk-En dataset.""" - datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in _MKEN_TRAIN_DATASETS] - target_datasets = [[item[0], [item[1][1]]] for item in _MKEN_TRAIN_DATASETS] - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.mken.%d" % vocab_size, vocab_size, - source_datasets + target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @registry.register_problem("setimes_mken_tokens_32k") class SETimesMkEnTokens32k(WMTProblem): """Problem spec for SETimes Mk-En translation.""" @@ -636,8 +602,20 @@ def targeted_vocab_size(self): return 2**15 # 32768 @property - def train_generator(self): - return mken_wordpiece_token_generator + def vocab_name(self): + return "vocab.mken" + + def train_generator(self, data_dir, tmp_dir, train): + datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in datasets] + target_datasets = [[item[0], [item[1][1]]] for item in datasets] + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, + source_datasets + target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -648,12 +626,62 @@ def target_space_id(self): return problem.SpaceID.EN_TOK -def parsing_character_generator(tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - filename = "parsing_%s" % ("train" if train else "dev") - text_filepath = os.path.join(tmp_dir, filename + ".text") - tags_filepath = os.path.join(tmp_dir, filename + ".tags") - return character_generator(text_filepath, tags_filepath, character_vocab, EOS) +@registry.register_problem("wmt_encs_tokens_32k") +class WMTEnCsTokens32k(problem.Problem): + """Problem spec for WMT English-Czech translation.""" + + @property + def target_vocab_size(self): + return 2**15 # 32768 + + @property + def vocab_name(self): + return "vocab.encs" + + def train_generator(self, data_dir, tmp_dir, train): + datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in datasets] + target_datasets = [[item[0], [item[1][1]]] for item in datasets] + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, + source_datasets + target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag) + return 
token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) + + @property + def input_space_id(self): + return problem.SpaceID.EN_TOK + + @property + def target_space_id(self): + return problem.SpaceID.CS_TOK + + +@registry.register_problem("wmt_encs_characters") +class WMTEnCsCharacters(WMTProblem): + """Problem spec for WMT En-Cs character-based translation.""" + + @property + def is_character_level(self): + return True + + def train_generator(self, data_dir, tmp_dir, train): + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_encs_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) + + @property + def input_space_id(self): + return problem.SpaceID.EN_CHR + + @property + def target_space_id(self): + return problem.SpaceID.CS_CHR def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 4f694a7f9..624623f4c 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -66,6 +66,9 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) + signal = tf.reshape(signal, [length, 2, num_timescales]) + signal = tf.transpose(signal, perm=[0, 2, 1]) + signal = tf.reshape(signal, [length, channels]) signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) signal = tf.reshape(signal, [1, length, channels]) return x + signal diff --git a/tensor2tensor/utils/get_ende_bleu.sh b/tensor2tensor/utils/get_ende_bleu.sh index 09078414f..3493af74c 100755 --- a/tensor2tensor/utils/get_ende_bleu.sh +++ b/tensor2tensor/utils/get_ende_bleu.sh @@ -5,10 +5,8 @@ tok_gold_targets=newstest2013.tok.de decodes_file=$1 -cut -d' ' -f1 $decodes_file > $decodes_file.target - # Tokenize. -perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file.target > $decodes_file.tok +perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file > $decodes_file.tok # Put compounds in ATAT format (comparable to papers like GNMT, ConvS2S). # See https://nlp.stanford.edu/projects/nmt/ : diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index c4bdcf942..0943881f3 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -722,8 +722,7 @@ def log_fn(inputs, outputs): tf.logging.info("Writing decodes into %s" % decode_filename) outfile = tf.gfile.Open(decode_filename, "w") for index in range(len(sorted_inputs)): - outfile.write("%s\t%s\n" % (decodes[sorted_keys[index]], - sorted_inputs[sorted_keys[index]])) + outfile.write("%s\n" % (decodes[sorted_keys[index]])) def decode_interactively(estimator): From bea499320874dc617631c52632f43ffd782542b7 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 25 Jul 2017 09:05:13 -0700 Subject: [PATCH 07/21] Clean up some code around escaping/unescaping tokens and add tests. 
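For reference, the escaping scheme this change documents and tests maps '_' to '\u', '\' to '\\', and any character outside the vocabulary's alphabet (or a newline) to '\<ord>;', then appends a trailing '_' as an end-of-token sentinel; _unescape_token inverts this with the regex added below. A standalone Python 3 sketch of the round trip (simplified relative to the real text_encoder.py code, which also supports Python 2 via six and validates input types):

import re

_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")

def escape_token(token, alphabet):
  # '\' -> '\\', '_' -> '\u', out-of-alphabet characters -> '\<ord>;', plus a trailing '_'.
  token = token.replace("\\", "\\\\").replace("_", "\\u")
  return "".join(c if c in alphabet and c != "\n" else r"\%d;" % ord(c)
                 for c in token) + "_"

def unescape_token(escaped_token):
  def match(m):
    if m.group(1) is None:
      return "_" if m.group(0) == "\\u" else "\\"
    return chr(int(m.group(1)))
  trimmed = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token
  return _UNESCAPE_REGEX.sub(match, trimmed)

# Toy alphabet that includes the backslash escape character itself.
alphabet = set("abcdefghijklmnopqrstuvwxyz\\_")
assert escape_token("foo_bar!", alphabet) == "foo\\ubar\\33;_"
assert unescape_token(escape_token("foo_bar\\baz!", alphabet)) == "foo_bar\\baz!"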
PiperOrigin-RevId: 163077617 --- tensor2tensor/data_generators/text_encoder.py | 248 +++++++++--------- .../data_generators/text_encoder_test.py | 68 +++++ 2 files changed, 199 insertions(+), 117 deletions(-) create mode 100644 tensor2tensor/data_generators/text_encoder_test.py diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 7c53784f3..afe1da9ae 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -24,15 +24,12 @@ from __future__ import division from __future__ import print_function -from collections import defaultdict +import collections import re # Dependency imports import six -from six import PY2 -from six import unichr # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf @@ -46,7 +43,7 @@ PAD_ID = RESERVED_TOKENS.index(PAD) # Normally 0 EOS_ID = RESERVED_TOKENS.index(EOS) # Normally 1 -if PY2: +if six.PY2: RESERVED_TOKENS_BYTES = RESERVED_TOKENS else: RESERVED_TOKENS_BYTES = [bytes(PAD, "ascii"), bytes(EOS, "ascii")] @@ -56,18 +53,17 @@ # '\u' is converted to '_' # '\\' is converted to '\' # '\213;' is converted to unichr(213) -_UNESCAPE_REGEX = re.compile(u"|".join([r"\\u", r"\\\\", r"\\([0-9]+);"])) +_UNESCAPE_REGEX = re.compile(ur"\\u|\\\\|\\([0-9]+);") +_ESCAPE_CHARS = set(u"\\_;0123456789") def native_to_unicode_py2(s): """Python 2: transform native string to Unicode.""" - if isinstance(s, unicode): - return s - return s.decode("utf-8") + return s if isinstance(s, unicode) else s.decode("utf8") # Conversion between Unicode and UTF-8, if required (on Python2) -if PY2: +if six.PY2: native_to_unicode = native_to_unicode_py2 unicode_to_native = lambda s: s.encode("utf-8") else: @@ -131,7 +127,7 @@ class ByteTextEncoder(TextEncoder): def encode(self, s): numres = self._num_reserved_ids - if PY2: + if six.PY2: return [ord(c) + numres for c in s] # Python3: explicitly convert to UTF-8 return [c + numres for c in s.encode("utf-8")] @@ -145,7 +141,7 @@ def decode(self, ids): decoded_ids.append(RESERVED_TOKENS_BYTES[int(id_)]) else: decoded_ids.append(int2byte(id_ - numres)) - if PY2: + if six.PY2: return "".join(decoded_ids) # Python3: join byte arrays and then decode string return b"".join(decoded_ids).decode("utf-8", "replace") @@ -199,6 +195,55 @@ def _load_vocab_from_file(self, filename): self._id_to_token[idx] = tok +def _escape_token(token, alphabet): + """Escape away underscores and OOV characters and append '_'. + + This allows the token to be experessed as the concatenation of a list + of subtokens from the vocabulary. The underscore acts as a sentinel + which allows us to invertibly concatenate multiple such lists. + + Args: + token: A unicode string to be escaped. + alphabet: A set of all characters in the vocabulary's alphabet. + + Returns: + escaped_token: An escaped unicode string. + + Raises: + ValueError: If the provided token is not unicode. + """ + if not isinstance(token, six.text_type): + raise ValueError("Expected string type for token, got %s" % type(token)) + + token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + ret = [ + c if c in alphabet and c != u"\n" else ur"\%d;" % ord(c) + for c in token] + return u"".join(ret) + "_" + + +def _unescape_token(escaped_token): + """Inverse of _escape_token(). 
+ + Args: + escaped_token: a unicode string + + Returns: + token: a unicode string + """ + def match(m): + if m.group(1) is None: + return u"_" if m.group(0) == u"\\u" else u"\\" + + try: + return six.unichr(int(m.group(1))) + except (ValueError, OverflowError) as _: + return "" + + trimmed = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token + return _UNESCAPE_REGEX.sub(match, trimmed) + + class SubwordTextEncoder(TextEncoder): """Class for invertibly encoding text using a limited vocabulary. @@ -276,7 +321,8 @@ def _tokens_to_subtokens(self, tokens): """ ret = [] for token in tokens: - ret.extend(self._escaped_token_to_subtokens(self._escape_token(token))) + ret.extend(self._escaped_token_to_subtokens( + _escape_token(token, self._alphabet))) return ret def _subtokens_to_tokens(self, subtokens): @@ -290,7 +336,7 @@ def _subtokens_to_tokens(self, subtokens): concatenated = "".join( [self._subtoken_to_subtoken_string(s) for s in subtokens]) split = concatenated.split("_") - return [self._unescape_token(t + "_") for t in split if t] + return [_unescape_token(t + "_") for t in split if t] def _subtoken_to_subtoken_string(self, subtoken): """Subtoken_String (string) corresponding to the given subtoken (id).""" @@ -312,12 +358,17 @@ def _escaped_token_to_subtokens(self, escaped_token): while pos < lesc: end = min(lesc, pos + self._max_subtoken_len) while end > pos: - subtoken = self._subtoken_string_to_id.get(escaped_token[pos:end], -1) - if subtoken != -1: + subtoken_id = self._subtoken_string_to_id.get(escaped_token[pos:end]) + if subtoken_id is not None: break end -= 1 - assert end > pos - ret.append(subtoken) + + # If there is no possible encoding of the escaped token then one of the + # characters in the token is not in the alphabet. This should be + # impossible and would be indicative of a bug. + assert subtoken_id is not None + + ret.append(subtoken_id) pos = end return ret @@ -331,27 +382,37 @@ def build_to_target_size(cls, num_iterations=4): """Builds a SubwordTextEncoder that has `vocab_size` near `target_size`. - Uses simple recursive binary search to find a `min_count` value that most + Uses simple recursive binary search to find a minimum token count that most closely matches the `target_size`. Args: - target_size: desired vocab_size to approximate. - token_counts: a dictionary of string to int. - min_val: an integer - lower bound for `min_count`. - max_val: an integer - upper bound for `min_count`. - num_iterations: an integer. how many iterations of refinement. + target_size: Desired vocab_size to approximate. + token_counts: A dictionary of token counts, mapping string to int. + min_val: An integer; lower bound for the minimum token count. + max_val: An integer; upper bound for the minimum token count. + num_iterations: An integer; how many iterations of refinement. Returns: - a SubwordTextEncoder instance. + A SubwordTextEncoder instance. + + Raises: + ValueError: If `min_val` is greater than `max_val`. 
""" + if min_val > max_val: + raise ValueError( + "Lower bound for the minimum token count " + "is greater than the upper bound.") + def bisect(min_val, max_val): """Bisection to find the right size.""" present_count = (max_val + min_val) // 2 tf.logging.info("Trying min_count %d" % present_count) subtokenizer = cls() - subtokenizer.build_from_token_counts(token_counts, - present_count, num_iterations) - if min_val >= max_val or subtokenizer.vocab_size == target_size: + subtokenizer.build_from_token_counts( + token_counts, present_count, num_iterations) + + # If min_val == max_val, we can't do any better than this. + if subtokenizer.vocab_size == target_size or min_val == max_val: return subtokenizer if subtokenizer.vocab_size > target_size: @@ -382,34 +443,27 @@ def build_from_token_counts(self, num_iterations: an integer. how many iterations of refinement. num_reserved_ids: an integer. how many ids to reserve for special tokens. """ - # first determine the alphabet to include all characters with count at - # least min_count in the dataset. - char_counts = defaultdict(int) - for token, count in six.iteritems(token_counts): - for c in token: - char_counts[c] += count - self._alphabet = set() - for c, count in six.iteritems(char_counts): - if count >= min_count: - self._alphabet.add(c) - # Make sure all characters needed for escaping are included - for c in u"\\_;0123456789": - self._alphabet.add(c) + self._init_alphabet_from_tokens(six.iterkeys(token_counts)) + + # Bootstrap the initial list of subtokens with the characters from the + # alphabet plus the escaping characters. + self._init_subtokens_from_list( + list(self._alphabet), reserved=num_reserved_ids) # We build iteratively. On each iteration, we segment all the words, # then count the resulting potential subtokens, keeping the ones # with high enough counts for our new vocabulary. if min_count < 1: min_count = 1 - for i in xrange(num_iterations): + for i in six.moves.range(num_iterations): tf.logging.info("Iteration {0}".format(i)) - counts = defaultdict(int) + counts = collections.defaultdict(int) for token, count in six.iteritems(token_counts): - escaped_token = self._escape_token(token) + escaped_token = _escape_token(token, self._alphabet) # we will count all tails of the escaped_token, starting from boundaries # determined by our current segmentation. if i == 0: - starts = xrange(len(escaped_token)) + starts = six.moves.range(len(escaped_token)) else: subtokens = self._escaped_token_to_subtokens(escaped_token) pos = 0 @@ -418,48 +472,43 @@ def build_from_token_counts(self, starts.append(pos) pos += len(self._all_subtoken_strings[subtoken]) for start in starts: - for end in xrange(start + 1, len(escaped_token) + 1): + for end in six.moves.range(start + 1, len(escaped_token) + 1): subtoken_string = escaped_token[start:end] counts[subtoken_string] += count - # Make sure all characters needed for escaping are included - for c in self._alphabet: - counts[c] += min_count # Array of sets of candidate subtoken strings, by length len_to_subtoken_strings = [] for subtoken_string, count in six.iteritems(counts): lsub = len(subtoken_string) - if count >= min_count: + # Always include all the alphabet characters or some strings will + # be unencodeable. 
+ if count >= min_count or subtoken_string in self._alphabet: # Add this subtoken string to its length set while len(len_to_subtoken_strings) <= lsub: len_to_subtoken_strings.append(set()) len_to_subtoken_strings[lsub].add(subtoken_string) new_subtoken_strings = [] - # consider the candidates longest to shortest, so that if we accept + # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. - for lsub in reversed(range(1, len(len_to_subtoken_strings))): + for lsub in six.moves.range(len(len_to_subtoken_strings)-1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: count = counts[subtoken_string] - if count >= min_count: - new_subtoken_strings.append((count, subtoken_string)) - for l in xrange(1, lsub): + if count >= min_count or subtoken_string in self._alphabet: + # Exclude alphabet tokens here, as they must be included later + # explicitly, regardless of count. + if subtoken_string not in self._alphabet: + new_subtoken_strings.append((count, subtoken_string)) + for l in six.moves.range(1, lsub): counts[subtoken_string[:l]] -= count - # Sort in decreasing order by count new_subtoken_strings.sort(reverse=True) - # Now we have a candidate vocabulary - old_alphabet = self._alphabet - self._init_from_list([u""] * num_reserved_ids + - [p[1] for p in new_subtoken_strings]) - assert old_alphabet == self._alphabet - tf.logging.info("vocab_size = %d" % self.vocab_size) - original = "This sentence was encoded by the SubwordTextEncoder." - encoded = self.encode(original) - print(encoded) - print([self._subtoken_to_subtoken_string(s) for s in encoded]) - decoded = self.decode(encoded) - print(decoded) - assert decoded == original + # Reinitialize to the candidate vocabulary, including the alphabet + # explicitly as the highest priority. + self._init_subtokens_from_list( + list(self._alphabet) + + [subtoken for _, subtoken in new_subtoken_strings], + reserved=num_reserved_ids) + tf.logging.info("vocab_size = %d" % self.vocab_size) def dump(self): """Debugging dump of the current subtoken vocabulary.""" @@ -468,15 +517,21 @@ def dump(self): print(u", ".join(u"{0} : '{1}'".format(i, s) for i, s in sorted(subtoken_strings))) - def _init_from_list(self, subtoken_strings): - """Initialize from a list of subtoken strings.""" - self._all_subtoken_strings = subtoken_strings + def _init_subtokens_from_list(self, subtoken_strings, reserved=0): + """Initialize token information from a list of subtoken strings.""" + self._all_subtoken_strings = [u""] * reserved + subtoken_strings # we remember the maximum length of any subtoken to avoid having to # check arbitrarily long strings. self._max_subtoken_len = max([len(s) for s in subtoken_strings]) self._subtoken_string_to_id = { - s: i for i, s in enumerate(subtoken_strings) if s} - self._alphabet = set([c for c in subtoken_strings if len(c) == 1]) + s: i+reserved for i, s in enumerate(subtoken_strings) if s} + + def _init_alphabet_from_tokens(self, tokens): + """Initialize alphabet from an iterable of token or subtoken strings.""" + # Include all characters from all tokens in the alphabet to guarantee that + # any token can be encoded. Additionally, include all escaping characters. 
+ self._alphabet = {c for token in tokens for c in token} + self._alphabet |= _ESCAPE_CHARS def _load_from_file(self, filename): """Load from a file.""" @@ -484,51 +539,10 @@ def _load_from_file(self, filename): with tf.gfile.Open(filename) as f: for line in f: subtoken_strings.append(native_to_unicode(line.strip()[1:-1])) - self._init_from_list(subtoken_strings) + self._init_subtokens_from_list(subtoken_strings) + self._init_alphabet_from_tokens(subtoken_strings) def store_to_file(self, filename): with tf.gfile.Open(filename, "w") as f: for subtoken_string in self._all_subtoken_strings: f.write("'" + unicode_to_native(subtoken_string) + "'\n") - - def _escape_token(self, token): - """Escape away underscores and OOV characters and append '_'. - - This allows the token to be experessed as the concatenation of a list - of subtokens from the vocabulary. The underscore acts as a sentinel - which allows us to invertibly concatenate multiple such lists. - - Args: - token: a unicode string - Returns: - escaped_token: a unicode string - """ - assert isinstance(token, six.text_type) - token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + u"_" - ret = u"" - for c in token: - if c in self._alphabet and c != u"\n": - ret += c - else: - ret += u"\\%d;" % ord(c) - return ret - - def _unescape_token(self, escaped_token): - """Inverse of _escape_token(). - - Args: - escaped_token: a unicode string - Returns: - token: a unicode string - """ - def match(m): - if m.group(1) is not None: - # Convert '\213;' to unichr(213) - try: - return unichr(int(m.group(1))) - except (ValueError, OverflowError) as _: - return "" - # Convert '\u' to '_' and '\\' to '\' - return u"_" if m.group(0) == u"\\u" else u"\\" - # Cut off the trailing underscore and apply the regex substitution - return _UNESCAPE_REGEX.sub(match, escaped_token[:-1]) diff --git a/tensor2tensor/data_generators/text_encoder_test.py b/tensor2tensor/data_generators/text_encoder_test.py new file mode 100644 index 000000000..7ac2ba911 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder_test.py @@ -0,0 +1,68 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for tensor2tensor.data_generators.text_encoder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.data_generators import text_encoder +import tensorflow as tf + + +class EscapeUnescapeTokenTest(tf.test.TestCase): + + def test_escape_token(self): + escaped = text_encoder._escape_token( + u'Foo! Bar.\nunder_score back\\slash', + set('abcdefghijklmnopqrstuvwxyz .\n') | text_encoder._ESCAPE_CHARS) + + self.assertEqual( + u'\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_', escaped) + + def test_unescape_token(self): + unescaped = text_encoder._unescape_token( + u'\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_') + + self.assertEqual( + u'Foo! 
Bar.\nunder_score back\\slash', unescaped) + + +class SubwordTextEncoderTest(tf.test.TestCase): + + def test_encode_decode(self): + token_counts = { + u'this': 9, + u'sentence': 14, + u'the': 100, + u'encoded': 1, + u'was': 20, + u'by': 50, + } + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 50, token_counts, 2, 10) + encoder.build_from_token_counts(token_counts, min_count=2) + + original = 'This sentence was encoded by the SubwordTextEncoder.' + encoded = encoder.encode(original) + decoded = encoder.decode(encoded) + self.assertEqual(original, decoded) + + +if __name__ == '__main__': + tf.test.main() From 83a757dd43df099b3c545e5cd2e9f9f9f0aed50b Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Tue, 25 Jul 2017 12:04:03 -0700 Subject: [PATCH 08/21] Adding encoder conv attention. A query block attends to a neighborhood to the left and the right of it. Pair programmed (Ashish + Niki) PiperOrigin-RevId: 163103460 --- tensor2tensor/models/common_attention.py | 97 +++++++++++++++++++++++- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 624623f4c..98a198f85 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -435,6 +435,91 @@ def local(x): return output +def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, + name=None): + """strided block local self-attention. + + Args: + q: a Tensor with shape [batch, heads, length, depth_k] + k: a Tensor with shape [batch, heads, length, depth_k] + v: a Tensor with shape [batch, heads, length, depth_v] + block_length: an integer + filter_width: an integer indicating how much to look left. + name: an optional string + + Returns: + a Tensor of shape [batch, heads, length, depth_v] + """ + with tf.variable_scope(name, default_name="local_self_attention_1d", + values=[q, k, v]): + v_shape = v.get_shape() + depth_v = tf.shape(v)[3] + batch_size = tf.shape(q)[0] + num_heads = tf.shape(q)[1] + original_length = tf.shape(q)[2] + # making sure q is a multiple of d + def pad_to_multiple(x, pad_length): + x_length = tf.shape(x)[2] + return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]]) + def pad_l_and_r(x, pad_length): + return tf.pad(x, [[0, 0], [0, 0], [pad_length, pad_length], [0, 0]]) + q = pad_to_multiple(q, block_length) + k = pad_to_multiple(k, block_length) + v = pad_to_multiple(v, block_length) + + # Setting up q blocks + new_q_shape = tf.shape(q) + # Setting up q blocks + q = tf.reshape(q, [new_q_shape[0], new_q_shape[1], + new_q_shape[2]//block_length, + block_length, new_q_shape[3]]) + + # Setting up k and v values + k = pad_l_and_r(k, filter_width) + v = pad_l_and_r(v, filter_width) + + length = tf.shape(k)[2] + full_filter_width = block_length + 2*filter_width + # getting gather indices + indices = tf.range(0, length, delta=1, name="index_range") + # making indices [1, length, 1] to appy convs + indices = tf.reshape(indices, [1, -1, 1]) + kernel = tf.expand_dims(tf.eye(full_filter_width), axis=1) + gather_indices = tf.nn.conv1d( + tf.cast(indices, tf.float32), + kernel, + block_length, + padding="VALID", + name="gather_conv") + + gather_indices = tf.squeeze(tf.cast(gather_indices, tf.int32), axis=0) + + # [length, batch, heads, dim] + k_t = tf.transpose(k, [2, 0, 1, 3]) + k_new = tf.gather(k_t, gather_indices) + + # [batch, heads, blocks, block_length, dim] + k_new = tf.transpose(k_new, [2, 3, 0, 1, 4]) + + attention_bias = tf.expand_dims( 
+ tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) + + v_t = tf.transpose(v, [2, 0, 1, 3]) + v_new = tf.gather(v_t, gather_indices) + v_new = tf.transpose(v_new, [2, 3, 0, 1, 4]) + + logits = tf.matmul(q, k_new, transpose_b=True) + + attention = tf.nn.softmax(logits+attention_bias) + output = tf.matmul(attention, v_new) + + output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) + # Remove the padding if introduced + output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) + output.set_shape(v_shape) + return output + + def multihead_attention(query_antecedent, memory_antecedent, bias, @@ -460,8 +545,9 @@ def multihead_attention(query_antecedent, dropout_rate: a floating point number image_shapes: optional tuple of integer scalars. see comments for attention_image_summary() - attention_type: a string, either "dot_product" or "local_mask_right" - block_length: an integer - relevent for "local_mask_right" + attention_type: a string, either "dot_product" or "local_mask_right" or + "local_unmasked" + block_length: an integer - relevant for "local_mask_right" name: an optional string Returns: @@ -509,9 +595,11 @@ def multihead_attention(query_antecedent, if attention_type == "dot_product": x = dot_product_attention( q, k, v, bias, dropout_rate, image_shapes) - else: - assert attention_type == "local_mask_right" + elif attention_type == "local_mask_right": x = masked_local_attention_1d(q, k, v, block_length=block_length) + else: + assert attention_type == "local_unmasked" + x = unmasked_local_attention_1d(q, k, v, block_length=block_length) x = combine_heads(x) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x @@ -652,4 +740,5 @@ def parameter_attention(x, y = tf.reshape(y, [batch_size, length, total_value_depth]) y.set_shape([None, None, total_value_depth]) y = common_layers.conv1d(y, output_depth, 1, name="output_transform") + return y From d190b79861d849569e42d8ad892b337983df39eb Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Tue, 25 Jul 2017 15:20:01 -0700 Subject: [PATCH 09/21] Update inspect.py to allow decoding with TokenTextEncoder and ByteTextEncoder. 
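The flags are checked in priority order: a SubwordTextEncoder vocabulary wins over a TokenTextEncoder vocabulary, which wins over --byte_text_encoder; with none of them set, no encoder is constructed. A rough standalone sketch of the same selection logic (the vocabulary paths and helper name are placeholders):

  from tensor2tensor.data_generators import text_encoder

  def make_encoder(subword_vocab=None, token_vocab=None, use_bytes=False):
    # Mirrors the flag priority used in inspect.py.
    if subword_vocab:
      return text_encoder.SubwordTextEncoder(subword_vocab)
    if token_vocab:
      return text_encoder.TokenTextEncoder(token_vocab)
    if use_bytes:
      return text_encoder.ByteTextEncoder()
    return None

  encoder = make_encoder(use_bytes=True)
  print(encoder.decode(encoder.encode("Hello")))  # round-trips to "Hello"
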
PiperOrigin-RevId: 163131045 --- tensor2tensor/data_generators/inspect.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensor2tensor/data_generators/inspect.py b/tensor2tensor/data_generators/inspect.py index 124c07017..6ba054d3c 100644 --- a/tensor2tensor/data_generators/inspect.py +++ b/tensor2tensor/data_generators/inspect.py @@ -34,6 +34,10 @@ tf.app.flags.DEFINE_string("subword_text_encoder_filename", "", "SubwordTextEncoder vocabulary file") +tf.app.flags.DEFINE_string("token_text_encoder_filename", "", + "TokenTextEncoder vocabulary file") +tf.app.flags.DEFINE_bool("byte_text_encoder", False, + "use a ByteTextEncoder") tf.app.flags.DEFINE_string("input_filename", "", "input filename") tf.app.flags.DEFINE_bool("print_inputs", False, "Print decoded inputs to stdout") @@ -48,6 +52,11 @@ def main(_): if FLAGS.subword_text_encoder_filename: encoder = text_encoder.SubwordTextEncoder( FLAGS.subword_text_encoder_filename) + elif FLAGS.token_text_encoder_filename: + encoder = text_encoder.TokenTextEncoder( + FLAGS.token_text_encoder_filename) + elif FLAGS.byte_text_encoder: + encoder = text_encoder.ByteTextEncoder() else: encoder = None reader = tf.python_io.tf_record_iterator(FLAGS.input_filename) From 7de63bd1dac3482d6c2388b715b958d3726870c9 Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Tue, 25 Jul 2017 15:58:10 -0700 Subject: [PATCH 10/21] Character-level version of lm1b. PiperOrigin-RevId: 163136520 --- tensor2tensor/bin/t2t-datagen | 4 ++++ tensor2tensor/data_generators/lm1b.py | 10 +++++++--- .../data_generators/problem_hparams.py | 17 +++++++++++++++++ tensor2tensor/data_generators/text_encoder.py | 2 ++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 783906d95..a9fa12255 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -110,6 +110,10 @@ _SUPPORTED_PROBLEM_GENERATORS = { lambda: lm1b.generator(FLAGS.tmp_dir, True), lambda: lm1b.generator(FLAGS.tmp_dir, False) ), + "lm1b_characters": ( + lambda: lm1b.generator(FLAGS.tmp_dir, True, characters=True), + lambda: lm1b.generator(FLAGS.tmp_dir, False, characters=True) + ), "wiki_32k": ( lambda: wiki.generator(FLAGS.tmp_dir, True), 1000 diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index 562435184..a436e0e6e 100644 --- a/tensor2tensor/data_generators/lm1b.py +++ b/tensor2tensor/data_generators/lm1b.py @@ -63,7 +63,7 @@ def _original_vocab(tmp_dir): def _replace_oov(original_vocab, line): """Replace out-of-vocab words with "UNK". - This maintains compatability with published results. + This maintains compatibility with published results. Args: original_vocab: a set of strings (The standard vocabulary for the dataset) @@ -138,12 +138,13 @@ def _get_or_build_subword_text_encoder(tmp_dir): return ret -def generator(tmp_dir, train): +def generator(tmp_dir, train, characters=False): """Generator for lm1b sentences. Args: tmp_dir: a string. train: a boolean. 
+ characters: a boolean Yields: A dictionary {"inputs": [0], "targets": []} @@ -152,7 +153,10 @@ def generator(tmp_dir, train): original_vocab = _original_vocab(tmp_dir) files = (_train_data_filenames(tmp_dir) if train else [_dev_data_filename(tmp_dir)]) - encoder = _get_or_build_subword_text_encoder(tmp_dir) + if characters: + encoder = text_encoder.ByteTextEncoder() + else: + encoder = _get_or_build_subword_text_encoder(tmp_dir) for filepath in files: tf.logging.info("filepath = %s", filepath) for line in tf.gfile.Open(filepath): diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 159ea6ac9..2792c79e9 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -336,6 +336,22 @@ def lm1b_32k(model_hparams): return p +def lm1b_characters(unused_model_hparams): + """Billion-word language-modeling benchmark, 32k subword vocabulary.""" + p = default_problem_hparams() + # ratio of dev tokens (including eos) to dev words (including eos) + # 826189 / 159658 = 5.174742 + p.perplexity_exponent = 5.174742 + p.input_modality = {} + encoder = text_encoder.ByteTextEncoder() + p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) + p.vocabulary = { + "targets": encoder + } + p.target_space_id = 2 + return p + + def wiki_32k(model_hparams): """Wikipedia title to article. 32k subtoken vocabulary.""" p = default_problem_hparams() @@ -623,6 +639,7 @@ def image_celeba(unused_model_hparams): "audio_wsj_characters_test": audio_wsj_characters, "audio_wsj_tokens_8k_tune": lambda p: audio_wsj_tokens(p, 2**13), "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), + "lm1b_characters": lm1b_characters, "lm1b_32k": lm1b_32k, "wiki_32k": wiki_32k, "lmptb_10k": lmptb_10k, diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index afe1da9ae..6b01e3a35 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -128,6 +128,8 @@ class ByteTextEncoder(TextEncoder): def encode(self, s): numres = self._num_reserved_ids if six.PY2: + if isinstance(s, unicode): + s = s.encode("utf-8") return [ord(c) + numres for c in s] # Python3: explicitly convert to UTF-8 return [c + numres for c in s.encode("utf-8")] From fd1a87d214861ea8d8ec3079cd636b145aad7630 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 25 Jul 2017 17:06:12 -0700 Subject: [PATCH 11/21] Problem.eval_metrics PiperOrigin-RevId: 163145397 --- tensor2tensor/data_generators/genetics.py | 7 +- tensor2tensor/data_generators/problem.py | 18 ++++ tensor2tensor/data_generators/wmt.py | 15 +++ tensor2tensor/utils/metrics.py | 109 ++++++++++++++-------- tensor2tensor/utils/t2t_model.py | 4 +- tensor2tensor/utils/trainer_utils.py | 26 ++++-- 6 files changed, 129 insertions(+), 50 deletions(-) diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index 848c2341b..309580d53 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -49,6 +49,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import metrics from tensor2tensor.utils import registry import tensorflow as tf @@ -141,7 +142,8 @@ def hparams(self, defaults, model_hparams): p.target_space_id = problem.SpaceID.REAL def 
example_reading_spec(self): - # TODO(rsepassi): propagate and apply targets_mask to output RealModality. + # TODO(rsepassi): propagate and apply targets_mask to output RealModality + # and to eval metrics (weights_fn?). data_fields = { "inputs": tf.VarLenFeature(tf.int64), "targets_mask": tf.VarLenFeature(tf.float32), @@ -158,6 +160,9 @@ def preprocess_examples(self, examples, mode): return examples + def eval_metrics(self): + return [metrics.Metrics.RMSE] + @registry.register_problem("genetics_cage10") class GeneticsCAGE10(GeneExpressionProblem): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 22b6214e6..69d81e58e 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -22,6 +22,7 @@ from tensor2tensor.data_generators import generator_utils as utils from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import metrics import tensorflow as tf @@ -111,6 +112,17 @@ class Problem(object): * hparams(defaults, model_hparams) - Specify the problem hyperparameters (see _default_hparams) - Mutate defaults as needed + * example_reading_spec + - Specify the names and types of the features on disk. + - Specify tf.contrib.slim.tfexample_decoder + * preprocess_examples(examples, mode) + - Preprocess the example feature dict from feature name to Tensor or + SparseTensor. + - Used in training, eval, and inference (specified by mode). + + Eval: + * eval_metrics + - Specify the set of evaluation metrics for this problem. Inference: * feature_encoders(data_dir) @@ -151,6 +163,12 @@ def preprocess_examples(self, examples, mode): del mode return examples + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY + ] + # ============================================================================ # END SUBCLASS INTERFACE # ============================================================================ diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 50125ccd1..519d55996 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -28,6 +28,7 @@ from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators import wsj_parsing +from tensor2tensor.utils import metrics from tensor2tensor.utils import registry import tensorflow as tf @@ -120,6 +121,13 @@ def hparams(self, defaults, unused_model_hparams): if self.is_character_level: p.loss_multiplier = 2.0 + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU + ] + # Generic generators used later for multiple problems. 
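# The new hook is not specific to wmt.py: any registered Problem can declare
# its own metrics the same way. A hypothetical, minimal sketch (the class and
# its registration name are invented for illustration; problem, metrics and
# registry are already imported at the top of this file, and the returned
# entries must be keys of metrics.METRICS_FNS or create_evaluation_metrics
# raises ValueError):


@registry.register_problem("example_regression_problem")
class ExampleRegressionProblem(problem.Problem):

  def eval_metrics(self):
    return [metrics.Metrics.ACC, metrics.Metrics.RMSE]
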
@@ -658,6 +666,13 @@ def input_space_id(self): def target_space_id(self): return problem.SpaceID.CS_TOK + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU + ] + @registry.register_problem("wmt_encs_characters") class WMTEnCsCharacters(WMTProblem): diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 118e33394..29f44b574 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -18,8 +18,6 @@ from __future__ import division from __future__ import print_function -import functools - # Dependency imports import six @@ -29,7 +27,24 @@ import tensorflow as tf -FLAGS = tf.flags.FLAGS + +class Metrics(object): + """Available evaluation metrics.""" + # Entries here should match the keys in METRICS_FN below + ACC = "accuracy" + ACC_TOP5 = "accuracy_top5" + ACC_PER_SEQ = "accuracy_per_sequence" + NEG_LOG_PERPLEXITY = "neg_log_perplexity" + APPROX_BLEU = "approx_bleu_score" + RMSE = "rmse" + + +def padded_rmse(predictions, labels, weights_fn=common_layers.weights_all): + predictions, labels = common_layers.pad_with_zeros(predictions, labels) + targets = labels + weights = weights_fn(targets) + error = tf.sqrt(tf.pow(predictions - labels, 2)) + return tf.reduce_sum(error * weights), tf.reduce_sum(weights) def padded_accuracy_topk(predictions, @@ -98,62 +113,76 @@ def create_evaluation_metrics(problems): """Creates the evaluation metrics for the model. Args: - problems: List of strings containing the name of the problems. + problems: List of tuples (problem name, problem instance). Returns: A dictionary with keys that are strings naming the evaluation metrics and values that are functions taking arguments of (predictions, targets), returning a tuple of a tensor of the metric's value together with an op to update the metric's value. + + Raises: + ValueError: if the metrics specified by a problem are not recognized (i.e. + are not defined in the Metrics enum. """ - def append_metric_fns(metric_tup, eval_metrics): - """Append problem-specific and global metrics to eval_metrics.""" - metric_name, metric_function = metric_tup - def fn(predictions, labels, weights, idx, weights_fn): - # The 'weights' argument represents problem-choice here, - # we need to keep this name because MetricSpecs checks it. + def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): + """Create a metric fn conditioned on problem_idx.""" + + def problem_metric_fn(predictions, labels, weights): problem_choice = weights (scores, weights) = tf.cond( - tf.equal(idx, problem_choice), # pylint: disable=cell-var-from-loop - lambda: metric_function(predictions, labels, weights_fn=weights_fn), + tf.equal(problem_idx, problem_choice), + lambda: metric_fn(predictions, labels, weights_fn=weights_fn), lambda: (tf.constant(0.0), tf.constant(0.0))) # The tf.metrics.mean function assures correct aggregation. 
return tf.metrics.mean(scores, weights) - for i, problem in enumerate(problems): - name = "metrics-%s/%s" % (problem, metric_name) - class_output = "image" in problem and "coco" not in problem - weights_fn = (common_layers.weights_all if class_output - else common_layers.weights_nonzero) - eval_metrics[name] = functools.partial(fn, idx=i, weights_fn=weights_fn) - - def global_fn(predictions, labels, weights): - (scores, weights) = metric_function(predictions, labels) - return tf.metrics.mean(scores, weights) - - eval_metrics["metrics/%s" % metric_name] = global_fn + return problem_metric_fn eval_metrics = dict() - - # Metrics are functions that take predictions and labels and return - # a tensor of metrics and a tensor of weights. - # The results are passed to tf.metrics.mean to accumulate properly. - metrics_list = [("accuracy", padded_accuracy), ("accuracy_top5", - padded_accuracy_top5), - ("accuracy_per_sequence", padded_sequence_accuracy), - ("neg_log_perplexity", padded_neg_log_perplexity)] - - # TODO(nikip): Extend this to support use of custom metrics for problems. - for problem in problems: - if "wmt" in problem: - metrics_list.append(("approx_bleu_score", bleu_hook.bleu_score)) - - for metric in metrics_list: - append_metric_fns(metric, eval_metrics) + for problem_idx, (problem_name, problem_instance) in enumerate(problems): + if problem_instance is None: + # For problems in problem_hparams + metrics = [ + Metrics.ACC, Metrics.ACC_TOP5, Metrics.ACC_PER_SEQ, + Metrics.NEG_LOG_PERPLEXITY + ] + if "wmt" in problem_name: + metrics.append(Metrics.APPROX_BLEU) + else: + # For registered Problems + metrics = problem_instance.eval_metrics() + if not all([m in METRICS_FNS for m in metrics]): + raise ValueError("Unrecognized metric. Problem %s specified metrics " + "%s. Recognized metrics are %s." % + (problem_name, metrics, METRICS_FNS.keys())) + + class_output = "image" in problem_name and "coco" not in problem_name + weights_fn = (common_layers.weights_all + if class_output else common_layers.weights_nonzero) + + for metric in metrics: + metric_fn = METRICS_FNS[metric] + problem_metric_fn = make_problem_specific_metric_fn( + metric_fn, problem_idx, weights_fn) + eval_metrics["metrics-%s/%s" % (problem_name, metric)] = problem_metric_fn return { k: tf.contrib.learn.MetricSpec( v, prediction_key="predictions", weight_key="problem_choice") for (k, v) in six.iteritems(eval_metrics) } + + +# Metrics are functions that take predictions and labels and return +# a tensor of metrics and a tensor of weights. +# The results are passed to tf.metrics.mean to accumulate properly. +METRICS_FNS = { + Metrics.ACC: padded_accuracy, + Metrics.ACC_TOP5: padded_accuracy_top5, + Metrics.ACC_PER_SEQ: padded_sequence_accuracy, + Metrics.NEG_LOG_PERPLEXITY: padded_neg_log_perplexity, + Metrics.APPROX_BLEU: bleu_hook.bleu_score, + Metrics.RMSE: padded_rmse, +} diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 9777568fc..66e40d495 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -164,6 +164,8 @@ def infer(self, Returns: samples: an integer `Tensor`. """ + # TODO(rsepassi): Make decoding work with real-valued model outputs + # (i.e. if the target modality is RealModality). if not self.has_input: # since there is no input, it is more interesting to see randomly # generated sequences, than to see the most likely sequence repeatedly. 
@@ -500,5 +502,5 @@ def _warn_changed_modality_type(new_name, old_name, feature_name): old_type, old_name = registry.parse_modality_name(old_name) if new_type != old_type: tf.logging.warning("%s has a designated modality type %s (%s) but has been " - "overriden with a modality of type %s (%s).", + "overridden with a modality of type %s (%s).", feature_name, old_type, old_name, new_type, new_name) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 0943881f3..bf42c36cc 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -85,7 +85,7 @@ flags.DEFINE_integer("local_eval_frequency", 2000, "Run evaluation every this steps during local training.") flags.DEFINE_bool("locally_shard_to_cpu", False, - "Use CPU as a sharding device runnning locally. This allows " + "Use CPU as a sharding device running locally. This allows " "to test sharded model construction on a machine with 1 GPU.") flags.DEFINE_bool("daisy_chain_variables", True, "copy variables around in a daisy chain") @@ -103,6 +103,9 @@ flags.DEFINE_integer("ps_replicas", 0, "How many ps replicas.") # Decode flags +# Set one of {decode_from_dataset, decode_interactive, decode_from_file} to +# decode. +flags.DEFINE_bool("decode_from_dataset", False, "Decode from dataset on disk.") flags.DEFINE_bool("decode_use_last_position_only", False, "In inference, use last position only for speedup.") flags.DEFINE_bool("decode_interactive", False, @@ -152,17 +155,24 @@ def experiment_fn(output_dir): def create_experiment(output_dir, data_dir, model_name, train_steps, eval_steps): + """Create Experiment.""" hparams = create_hparams(FLAGS.hparams_set, data_dir) estimator, input_fns = create_experiment_components( hparams=hparams, output_dir=output_dir, data_dir=data_dir, model_name=model_name) + eval_metrics = metrics.create_evaluation_metrics( + zip(FLAGS.problems.split("-"), hparams.problem_instances)) + if ("autotune" in FLAGS and FLAGS.autotune and + FLAGS.objective not in eval_metrics): + raise ValueError("Tuning objective %s not among evaluation metrics %s" % + (FLAGS.objective, eval_metrics.keys())) return tf.contrib.learn.Experiment( estimator=estimator, train_input_fn=input_fns["train"], eval_input_fn=input_fns["eval"], - eval_metrics=metrics.create_evaluation_metrics(FLAGS.problems.split("-")), + eval_metrics=eval_metrics, train_steps=train_steps, eval_steps=eval_steps, min_eval_frequency=FLAGS.local_eval_frequency, @@ -585,18 +595,18 @@ def run_locally(exp): Args: exp: Experiment. """ - if exp.train_steps > 0: - # Train - tf.logging.info("Performing local training.") + if exp.train_steps > 0 or exp.eval_steps > 0: + tf.logging.info("Performing local training and evaluation.") exp.train_and_evaluate() + decode(exp.estimator) - # Predict - estimator = exp.estimator + +def decode(estimator): if FLAGS.decode_interactive: decode_interactively(estimator) elif FLAGS.decode_from_file is not None: decode_from_file(estimator, FLAGS.decode_from_file) - else: + elif FLAGS.decode_from_dataset: decode_from_dataset(estimator) From ca08ad9bf1ec957646a17eda089d3b530fb77d93 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Tue, 25 Jul 2017 17:26:22 -0700 Subject: [PATCH 12/21] Un-reorder of timing signals to make trained models work. 
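Both layouts carry the same information, but weights trained with the original ordering expect all sin components in the first half of the channel dimension and all cos components in the second half; feeding them the interleaved variant silently changes what every channel encodes. A small numpy sketch of the difference (sizes chosen arbitrarily):

  import numpy as np

  length, channels = 5, 8
  num_timescales = channels // 2
  position = np.arange(length, dtype=np.float32)
  inv_timescales = np.exp(
      -np.arange(num_timescales) * (np.log(1.0e4) / (num_timescales - 1)))
  scaled_time = position[:, None] * inv_timescales[None, :]

  # Original (and now restored) layout: [sin | cos] concatenated on channels.
  concat_signal = np.concatenate(
      [np.sin(scaled_time), np.cos(scaled_time)], axis=1)

  # Layout produced by the reshape/transpose that this change removes:
  # sin and cos interleaved per timescale.
  interleaved = concat_signal.reshape(length, 2, num_timescales).transpose(
      0, 2, 1).reshape(length, channels)

  # Same values, different channel order -- enough to confuse weights that
  # were trained against the other layout.
  assert not np.allclose(concat_signal, interleaved)
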
PiperOrigin-RevId: 163147659 --- tensor2tensor/models/common_attention.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 98a198f85..1a8b2c79d 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -66,9 +66,6 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) - signal = tf.reshape(signal, [length, 2, num_timescales]) - signal = tf.transpose(signal, perm=[0, 2, 1]) - signal = tf.reshape(signal, [length, channels]) signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) signal = tf.reshape(signal, [1, length, channels]) return x + signal From b9fcd66f14ecded01cc257932655f5b1f493e3b9 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Tue, 25 Jul 2017 18:12:16 -0700 Subject: [PATCH 13/21] Back to wmt16 on one set not downloadable from wmt17, internal merges. PiperOrigin-RevId: 163152415 --- tensor2tensor/bin/t2t-datagen | 1 + tensor2tensor/data_generators/text_encoder.py | 9 +++++---- tensor2tensor/data_generators/wmt.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index a9fa12255..629014713 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -281,6 +281,7 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): + tf.logging.info("Generating training data for %s.", problem_name) problem = registry.problem(problem_name) task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 6b01e3a35..9fc9eed88 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -30,6 +30,7 @@ # Dependency imports import six +from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf @@ -457,7 +458,7 @@ def build_from_token_counts(self, # with high enough counts for our new vocabulary. if min_count < 1: min_count = 1 - for i in six.moves.range(num_iterations): + for i in xrange(num_iterations): tf.logging.info("Iteration {0}".format(i)) counts = collections.defaultdict(int) for token, count in six.iteritems(token_counts): @@ -474,7 +475,7 @@ def build_from_token_counts(self, starts.append(pos) pos += len(self._all_subtoken_strings[subtoken]) for start in starts: - for end in six.moves.range(start + 1, len(escaped_token) + 1): + for end in xrange(start + 1, len(escaped_token) + 1): subtoken_string = escaped_token[start:end] counts[subtoken_string] += count # Array of sets of candidate subtoken strings, by length @@ -491,7 +492,7 @@ def build_from_token_counts(self, new_subtoken_strings = [] # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. 
- for lsub in six.moves.range(len(len_to_subtoken_strings)-1, 0, -1): + for lsub in xrange(len(len_to_subtoken_strings)-1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: count = counts[subtoken_string] @@ -500,7 +501,7 @@ def build_from_token_counts(self, # explicitly, regardless of count. if subtoken_string not in self._alphabet: new_subtoken_strings.append((count, subtoken_string)) - for l in six.moves.range(1, lsub): + for l in xrange(1, lsub): counts[subtoken_string[:l]] -= count new_subtoken_strings.sort(reverse=True) diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 519d55996..9587d4d2a 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -259,7 +259,7 @@ def bi_vocabs_token_generator(source_path, _ENDE_TRAIN_DATASETS = [ [ - "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long ("training-parallel-nc-v11/news-commentary-v11.de-en.en", "training-parallel-nc-v11/news-commentary-v11.de-en.de") ], From 92101af0f2fbc4e16557fd688bde9cd9cc33a452 Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Wed, 26 Jul 2017 09:58:41 -0700 Subject: [PATCH 14/21] Bug fix, specify axis for squeeze when computing BLEU score PiperOrigin-RevId: 163219501 --- tensor2tensor/utils/bleu_hook.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index 06d62ad1e..20a7c8426 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -92,7 +92,6 @@ def compute_bleu(reference_corpus, matches_by_order[len(ngram) - 1] += overlap[ngram] for ngram in translation_ngram_counts: possible_matches_by_order[len(ngram)-1] += translation_ngram_counts[ngram] - precisions = [0] * max_order for i in xrange(0, max_order): if possible_matches_by_order[i] > 0: @@ -107,7 +106,6 @@ def compute_bleu(reference_corpus, if use_bp: ratio = translation_length / reference_length bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0 - bleu = geo_mean * bp return np.float32(bleu) @@ -128,8 +126,8 @@ def bleu_score(predictions, labels, **unused_kwargs): """ outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) # Convert the outputs and labels to a [batch_size, input_length] tensor. 
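  # Shape note (assuming the usual T2T model outputs): predictions arrive as
  # [batch, length, 1, 1, vocab_size], so after the argmax `outputs` is
  # [batch, length, 1, 1], and `labels` is [batch, length, 1, 1] as well.
  # An axis-less tf.squeeze would also drop a batch or length dimension that
  # happens to equal 1 (e.g. batch size 1 while decoding), hence the explicit
  # trailing axes below.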
- outputs = tf.squeeze(outputs) - labels = tf.squeeze(labels) + outputs = tf.squeeze(outputs, axis=[-1, -2]) + labels = tf.squeeze(labels, axis=[-1, -2]) bleu = tf.py_func(compute_bleu, (labels, outputs), tf.float32) return bleu, tf.constant(1.0) From 28eb48f9d1799fbe83ae54c88c02fa4301f97120 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 26 Jul 2017 12:42:01 -0700 Subject: [PATCH 15/21] Limit number of concurrent processes in GeneExpressionProblem PiperOrigin-RevId: 163241281 --- tensor2tensor/data_generators/genetics.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index 309580d53..88b82cb49 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -36,6 +36,7 @@ from __future__ import print_function import itertools +import math import multiprocessing as mp import os @@ -54,6 +55,7 @@ import tensorflow as tf +MAX_CONCURRENT_PROCESSES = 10 _bases = list("ACTG") @@ -122,12 +124,19 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): start_idx, end_idx)) processes.append(p) - # Start and wait for processes + # Start and wait for processes in batches assert len(processes) == num_shards + 2 # 1 per training shard + dev + test - for p in processes: - p.start() - for p in processes: - p.join() + + num_batches = int( + math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES)) + for i in xrange(num_batches): + start = i * MAX_CONCURRENT_PROCESSES + end = start + MAX_CONCURRENT_PROCESSES + current = processes[start:end] + for p in current: + p.start() + for p in current: + p.join() # Shuffle generator_utils.shuffle_dataset(all_filepaths) From cff9f4367095e62b415637d6fb839db7bdc8a28d Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 26 Jul 2017 13:52:47 -0700 Subject: [PATCH 16/21] Allow building a subword vocab from a word vocab file and add tests. PiperOrigin-RevId: 163250427 --- .../data_generators/test_data/corpus-1.txt | 4 + .../data_generators/test_data/corpus-2.txt | 3 + .../data_generators/test_data/vocab-1.txt | 2 + .../data_generators/test_data/vocab-2.txt | 3 + tensor2tensor/data_generators/text_encoder.py | 123 ++++++++-------- .../text_encoder_build_subword.py | 36 +++-- .../data_generators/text_encoder_test.py | 107 ++++++++++++-- tensor2tensor/data_generators/tokenizer.py | 124 +++++++++++------ .../data_generators/tokenizer_test.py | 131 +++++++++++++++--- 9 files changed, 387 insertions(+), 146 deletions(-) create mode 100644 tensor2tensor/data_generators/test_data/corpus-1.txt create mode 100644 tensor2tensor/data_generators/test_data/corpus-2.txt create mode 100644 tensor2tensor/data_generators/test_data/vocab-1.txt create mode 100644 tensor2tensor/data_generators/test_data/vocab-2.txt diff --git a/tensor2tensor/data_generators/test_data/corpus-1.txt b/tensor2tensor/data_generators/test_data/corpus-1.txt new file mode 100644 index 000000000..c05e47f90 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/corpus-1.txt @@ -0,0 +1,4 @@ +One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't +know. + +Groucho Marx diff --git a/tensor2tensor/data_generators/test_data/corpus-2.txt b/tensor2tensor/data_generators/test_data/corpus-2.txt new file mode 100644 index 000000000..f45577c4b --- /dev/null +++ b/tensor2tensor/data_generators/test_data/corpus-2.txt @@ -0,0 +1,3 @@ +I haven't slept for 10 days... because that would be too long. 
+ +Mitch Hedberg diff --git a/tensor2tensor/data_generators/test_data/vocab-1.txt b/tensor2tensor/data_generators/test_data/vocab-1.txt new file mode 100644 index 000000000..d34d3d957 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/vocab-1.txt @@ -0,0 +1,2 @@ +lollipop,8 +reverberated,12 diff --git a/tensor2tensor/data_generators/test_data/vocab-2.txt b/tensor2tensor/data_generators/test_data/vocab-2.txt new file mode 100644 index 000000000..7793af4f6 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/vocab-2.txt @@ -0,0 +1,3 @@ +kattywampus,11 +balderdash,10 +jiggery-pokery,14 diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 9fc9eed88..69d29779a 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -30,11 +30,11 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf +xrange = six.moves.xrange # pylint: disable=redefined-builtin # Reserved tokens for things like padding and EOS symbols. PAD = "" @@ -295,7 +295,7 @@ def encode(self, raw_text): Returns: a list of integers in the range [0, vocab_size) """ - return self._tokens_to_subtokens(tokenizer.encode( + return self._tokens_to_subtoken_ids(tokenizer.encode( native_to_unicode(raw_text))) def decode(self, subtokens): @@ -307,14 +307,14 @@ def decode(self, subtokens): a native string """ return unicode_to_native(tokenizer.decode( - self._subtokens_to_tokens(subtokens))) + self._subtoken_ids_to_tokens(subtokens))) @property def vocab_size(self): """The subtoken vocabulary size.""" return len(self._all_subtoken_strings) - def _tokens_to_subtokens(self, tokens): + def _tokens_to_subtoken_ids(self, tokens): """Converts a list of tokens to a list of subtoken ids. Args: @@ -324,11 +324,11 @@ def _tokens_to_subtokens(self, tokens): """ ret = [] for token in tokens: - ret.extend(self._escaped_token_to_subtokens( + ret.extend(self._escaped_token_to_subtoken_ids( _escape_token(token, self._alphabet))) return ret - def _subtokens_to_tokens(self, subtokens): + def _subtoken_ids_to_tokens(self, subtokens): """Converts a list of subtoken ids to a list of tokens. Args: @@ -337,45 +337,58 @@ def _subtokens_to_tokens(self, subtokens): a list of strings. """ concatenated = "".join( - [self._subtoken_to_subtoken_string(s) for s in subtokens]) + [self._subtoken_id_to_subtoken_string(s) for s in subtokens]) split = concatenated.split("_") return [_unescape_token(t + "_") for t in split if t] - def _subtoken_to_subtoken_string(self, subtoken): - """Subtoken_String (string) corresponding to the given subtoken (id).""" + def _subtoken_id_to_subtoken_string(self, subtoken): + """Converts a subtoken integer ID to a subtoken string.""" if 0 <= subtoken < self.vocab_size: return self._all_subtoken_strings[subtoken] return u"" - def _escaped_token_to_subtokens(self, escaped_token): - """Converts an escaped token string to a list of subtokens. + def _escaped_token_to_subtoken_strings(self, escaped_token): + """Converts an escaped token string to a list of subtoken strings. Args: - escaped_token: an escaped token + escaped_token: An escaped token as a unicode string. Returns: - a list of one or more integers. + A list of subtokens as unicode strings. """ + # NOTE: This algorithm is greedy; it won't necessarily produce the "best" + # list of subtokens. 
ret = [] - pos = 0 - lesc = len(escaped_token) - while pos < lesc: - end = min(lesc, pos + self._max_subtoken_len) - while end > pos: - subtoken_id = self._subtoken_string_to_id.get(escaped_token[pos:end]) - if subtoken_id is not None: + start = 0 + token_len = len(escaped_token) + while start < token_len: + for end in xrange( + min(token_len, start + self._max_subtoken_len), start, -1): + subtoken = escaped_token[start:end] + if subtoken in self._subtoken_string_to_id: + ret.append(subtoken) + start = end break - end -= 1 - # If there is no possible encoding of the escaped token then one of the - # characters in the token is not in the alphabet. This should be - # impossible and would be indicative of a bug. - assert subtoken_id is not None - - ret.append(subtoken_id) - pos = end + else: # Did not break + # If there is no possible encoding of the escaped token then one of the + # characters in the token is not in the alphabet. This should be + # impossible and would be indicative of a bug. + assert False, "Token substring not found in subtoken vocabulary." return ret + def _escaped_token_to_subtoken_ids(self, escaped_token): + """Converts an escaped token string to a list of subtoken IDs. + + Args: + escaped_token: An escaped token as a unicode string. + Returns: + A list of subtoken IDs as integers. + """ + return [ + self._subtoken_string_to_id[subtoken] + for subtoken in self._escaped_token_to_subtoken_strings(escaped_token)] + @classmethod def build_to_target_size(cls, target_size, @@ -460,55 +473,51 @@ def build_from_token_counts(self, min_count = 1 for i in xrange(num_iterations): tf.logging.info("Iteration {0}".format(i)) - counts = collections.defaultdict(int) + + # Collect all substrings of the encoded token that break along current + # subtoken boundaries. + subtoken_counts = collections.defaultdict(int) for token, count in six.iteritems(token_counts): escaped_token = _escape_token(token, self._alphabet) - # we will count all tails of the escaped_token, starting from boundaries - # determined by our current segmentation. - if i == 0: - starts = six.moves.range(len(escaped_token)) - else: - subtokens = self._escaped_token_to_subtokens(escaped_token) - pos = 0 - starts = [] - for subtoken in subtokens: - starts.append(pos) - pos += len(self._all_subtoken_strings[subtoken]) - for start in starts: + subtokens = self._escaped_token_to_subtoken_strings(escaped_token) + start = 0 + for subtoken in subtokens: for end in xrange(start + 1, len(escaped_token) + 1): - subtoken_string = escaped_token[start:end] - counts[subtoken_string] += count - # Array of sets of candidate subtoken strings, by length + new_subtoken = escaped_token[start:end] + subtoken_counts[new_subtoken] += count + start += len(subtoken) + + # Array of sets of candidate subtoken strings, by length. len_to_subtoken_strings = [] - for subtoken_string, count in six.iteritems(counts): + for subtoken_string, count in six.iteritems(subtoken_counts): lsub = len(subtoken_string) - # Always include all the alphabet characters or some strings will - # be unencodeable. - if count >= min_count or subtoken_string in self._alphabet: - # Add this subtoken string to its length set + if count >= min_count: while len(len_to_subtoken_strings) <= lsub: len_to_subtoken_strings.append(set()) len_to_subtoken_strings[lsub].add(subtoken_string) - new_subtoken_strings = [] + # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. 
+ new_subtoken_strings = [] for lsub in xrange(len(len_to_subtoken_strings)-1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: - count = counts[subtoken_string] - if count >= min_count or subtoken_string in self._alphabet: - # Exclude alphabet tokens here, as they must be included later + count = subtoken_counts[subtoken_string] + if count >= min_count: + # Exclude alphabet tokens here, as they must be included later, # explicitly, regardless of count. if subtoken_string not in self._alphabet: new_subtoken_strings.append((count, subtoken_string)) for l in xrange(1, lsub): - counts[subtoken_string[:l]] -= count + subtoken_counts[subtoken_string[:l]] -= count + + # Include the alphabet explicitly to guarantee all strings are encodable. + new_subtoken_strings.extend( + (subtoken_counts.get(a, 0), a) for a in self._alphabet) new_subtoken_strings.sort(reverse=True) - # Reinitialize to the candidate vocabulary, including the alphabet - # explicitly as the highest priority. + # Reinitialize to the candidate vocabulary. self._init_subtokens_from_list( - list(self._alphabet) + [subtoken for _, subtoken in new_subtoken_strings], reserved=num_reserved_ids) tf.logging.info("vocab_size = %d" % self.vocab_size) diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py index a0d5d8937..88dfac116 100644 --- a/tensor2tensor/data_generators/text_encoder_build_subword.py +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -39,10 +39,13 @@ import tensorflow as tf -tf.app.flags.DEFINE_string('output_fn', '/tmp/my.subword_text_encoder', +tf.app.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder', 'where to store the SubwordTextEncoder') tf.app.flags.DEFINE_string('corpus_filepattern', '', 'Corpus of one or more text files') +tf.app.flags.DEFINE_string('vocab_filepattern', '', + 'One or more vocabulary files ' + '(one word per line as "word,count")') tf.app.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus') tf.app.flags.DEFINE_integer('corpus_max_lines', 10000, 'How many lines of corpus to read') @@ -52,16 +55,27 @@ def main(unused_argv): - gs = text_encoder.SubwordTextEncoder() - if not FLAGS.corpus_filepattern: - raise ValueError('Must provide --corpus_filepattern') - token_counts = tokenizer.corpus_token_counts( - FLAGS.corpus_filepattern, FLAGS.corpus_max_lines, - split_on_newlines=FLAGS.split_on_newlines) - gs.build_from_token_counts(token_counts, - FLAGS.min_count, - FLAGS.num_iterations) - gs.store_to_file(FLAGS.output_fn) + if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern: + raise ValueError( + 'Must only provide one of --corpus_filepattern or --vocab_filepattern') + + elif FLAGS.corpus_filepattern: + token_counts = tokenizer.corpus_token_counts( + FLAGS.corpus_filepattern, FLAGS.corpus_max_lines, + split_on_newlines=FLAGS.split_on_newlines) + + elif FLAGS.vocab_filepattern: + token_counts = tokenizer.vocab_token_counts( + FLAGS.vocab_filepattern, FLAGS.corpus_max_lines) + + else: + raise ValueError( + 'Must provide one of --corpus_filepattern or --vocab_filepattern') + + encoder = text_encoder.SubwordTextEncoder() + encoder.build_from_token_counts( + token_counts, FLAGS.min_count, FLAGS.num_iterations) + encoder.store_to_file(FLAGS.output_fn) if __name__ == '__main__': diff --git a/tensor2tensor/data_generators/text_encoder_test.py b/tensor2tensor/data_generators/text_encoder_test.py index 7ac2ba911..4142f8699 
100644 --- a/tensor2tensor/data_generators/text_encoder_test.py +++ b/tensor2tensor/data_generators/text_encoder_test.py @@ -18,8 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from __future__ import unicode_literals + +import collections # Dependency imports +import mock from tensor2tensor.data_generators import text_encoder import tensorflow as tf @@ -29,40 +33,113 @@ class EscapeUnescapeTokenTest(tf.test.TestCase): def test_escape_token(self): escaped = text_encoder._escape_token( - u'Foo! Bar.\nunder_score back\\slash', + 'Foo! Bar.\nunder_score back\\slash', set('abcdefghijklmnopqrstuvwxyz .\n') | text_encoder._ESCAPE_CHARS) self.assertEqual( - u'\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_', escaped) + '\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_', escaped) def test_unescape_token(self): unescaped = text_encoder._unescape_token( - u'\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_') + '\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_') self.assertEqual( - u'Foo! Bar.\nunder_score back\\slash', unescaped) + 'Foo! Bar.\nunder_score back\\slash', unescaped) class SubwordTextEncoderTest(tf.test.TestCase): def test_encode_decode(self): - token_counts = { - u'this': 9, - u'sentence': 14, - u'the': 100, - u'encoded': 1, - u'was': 20, - u'by': 50, - } + corpus = ( + 'This is a corpus of text that provides a bunch of tokens from which ' + 'to build a vocabulary. It will be used when strings are encoded ' + 'with a TextEncoder subclass. The encoder was coded by a coder.') + token_counts = collections.Counter(corpus.split(' ')) + alphabet = set(corpus) ^ {' '} + + original = 'This is a coded sentence encoded by the SubwordTextEncoder.' + token_counts.update(original.split(' ')) + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( - 50, token_counts, 2, 10) - encoder.build_from_token_counts(token_counts, min_count=2) + 100, token_counts, 2, 10) - original = 'This sentence was encoded by the SubwordTextEncoder.' + # Encoding should be reversible. encoded = encoder.encode(original) decoded = encoder.decode(encoded) self.assertEqual(original, decoded) + # The substrings coded and coder are frequent enough in the corpus that + # they should appear in the vocabulary even though they are substrings + # of other included strings. + subtoken_strings = {encoder._all_subtoken_strings[i] for i in encoded} + self.assertIn('encoded_', subtoken_strings) + self.assertIn('coded_', subtoken_strings) + self.assertIn('TextEncoder', encoder._all_subtoken_strings) + self.assertIn('coder', encoder._all_subtoken_strings) + + # Every character in the corpus should be in the encoder's alphabet and + # its subtoken vocabulary. + self.assertTrue(alphabet.issubset(encoder._alphabet)) + for a in alphabet: + self.assertIn(a, encoder._all_subtoken_strings) + + def test_unicode(self): + corpus = 'Cat emoticons. 
\U0001F638 \U0001F639 \U0001F63A \U0001F63B' + token_counts = collections.Counter(corpus.split(' ')) + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + + self.assertIn('\U0001F638', encoder._alphabet) + self.assertIn('\U0001F63B', encoder._all_subtoken_strings) + + def test_small_vocab(self): + corpus = 'The quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + alphabet = set(corpus) ^ {' '} + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 10, token_counts, 2, 10) + + # All vocabulary elements are in the alphabet and subtoken strings even + # if we requested a smaller vocabulary to assure all expected strings + # are encodable. + self.assertTrue(alphabet.issubset(encoder._alphabet)) + for a in alphabet: + self.assertIn(a, encoder._all_subtoken_strings) + + def test_encodable_when_not_in_alphabet(self): + corpus = 'the quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + original = 'This has UPPER CASE letters that are out of alphabet' + + # Early versions could have an infinite loop when breaking into subtokens + # if there was any out-of-alphabet characters in the encoded string. + encoded = encoder.encode(original) + decoded = encoder.decode(encoded) + + self.assertEqual(original, decoded) + encoded_str = ''.join(encoder._all_subtoken_strings[i] for i in encoded) + self.assertIn('\\84;', encoded_str) + + @mock.patch.object(text_encoder, '_ESCAPE_CHARS', new=set('\\_;13579')) + def test_raises_exception_when_not_encodable(self): + corpus = 'the quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + + # Deliberately exclude some required encoding chars from the alphabet + # and token list, making some strings unencodable. + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + original = 'This has UPPER CASE letters that are out of alphabet' + + # Previously there was a bug which produced an infinite loop in this case. + with self.assertRaises(AssertionError): + encoder.encode(original) + if __name__ == '__main__': tf.test.main() diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 0f4141199..1acffc04c 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -30,7 +30,7 @@ alphanumeric character and a non-alphanumeric character. This produces a list which alternates between "alphanumeric tokens" (strings of alphanumeric characters) and "non-alphanumeric tokens" - (strings of of non-alphanumeric characters). + (strings of non-alphanumeric characters). 2. Remove every token consisting of a single space, unless it is the very first or very last token in the list. 
These tokens are now @@ -44,28 +44,26 @@ from __future__ import division from __future__ import print_function -from collections import defaultdict +import collections import sys import unicodedata # Dependency imports -from six import PY2 -from six import unichr # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin - +import six import tensorflow as tf +xrange = six.moves.xrange # pylint: disable=redefined-builtin # Conversion between Unicode and UTF-8, if required (on Python2) -_native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s) +_native_to_unicode = (lambda s: s.decode("utf-8")) if six.PY2 else (lambda s: s) # This set contains all letter and number characters. _ALPHANUMERIC_CHAR_SET = set( - unichr(i) for i in xrange(sys.maxunicode) - if (unicodedata.category(unichr(i)).startswith("L") or - unicodedata.category(unichr(i)).startswith("N"))) + six.unichr(i) for i in xrange(sys.maxunicode) + if (unicodedata.category(six.unichr(i)).startswith("L") or + unicodedata.category(six.unichr(i)).startswith("N"))) def encode(text): @@ -110,42 +108,86 @@ def decode(tokens): return "".join(ret) -def corpus_token_counts(text_filepattern, corpus_max_lines, - split_on_newlines=True): +def _read_filepattern(filepattern, max_lines=None, split_on_newlines=True): + """Reads files matching a wildcard pattern, yielding the contents. + + Args: + filepattern: A wildcard pattern matching one or more files. + max_lines: If set, stop reading after reading this many lines. + split_on_newlines: A boolean. If true, then split files by lines and strip + leading and trailing whitespace from each line. Otherwise, treat each + file as a single string. + + Yields: + The contents of the files as lines, if split_on_newlines is True, or + the entire contents of each file if False. + """ + filenames = tf.gfile.Glob(filepattern) + lines_read = 0 + for filename in filenames: + with tf.gfile.Open(filename) as f: + if split_on_newlines: + for line in f: + yield line.strip() + lines_read += 1 + if max_lines and lines_read >= max_lines: + return + + else: + if max_lines: + doc = [] + for line in f: + doc.append(line) + lines_read += 1 + if max_lines and lines_read >= max_lines: + yield "".join(doc) + return + yield "".join(doc) + + else: + yield f.read() + + +def corpus_token_counts( + text_filepattern, corpus_max_lines, split_on_newlines=True): """Read the corpus and compute a dictionary of token counts. Args: - text_filepattern: a pattern matching one or more files - corpus_max_lines: an integer - maximum total lines to read. - split_on_newlines: a boolean. If true, then split files by lines and strip - leading and trailing whitespace from each line. + text_filepattern: A pattern matching one or more files. + corpus_max_lines: An integer; maximum total lines to read. + split_on_newlines: A boolean. If true, then split files by lines and strip + leading and trailing whitespace from each line. Otherwise, treat each + file as a single string. Returns: - a dictionary from token to count. + a dictionary mapping token to count. 
""" - def read_corpus(): - """Read the corpus.""" - docs = [] - lines_read = 0 - filenames = tf.gfile.Glob(text_filepattern) - for text_filename in filenames: - with tf.gfile.Open(text_filename) as f: - if not split_on_newlines: - docs.append("") - for line in f: - if split_on_newlines: - # The tokenizer updates token_counts in encode() - docs.append(line.strip()) - else: - docs[-1] += line - lines_read += 1 - if corpus_max_lines > 0 and lines_read > corpus_max_lines: - return docs - return docs - - counts = defaultdict(int) - for doc in read_corpus(): - for tok in encode(_native_to_unicode(doc)): - counts[tok] += 1 + counts = collections.Counter() + for doc in _read_filepattern( + text_filepattern, + max_lines=corpus_max_lines, + split_on_newlines=split_on_newlines): + counts.update(encode(_native_to_unicode(doc))) + return counts + +def vocab_token_counts(text_filepattern, max_lines): + """Read a vocab file and return a dictionary of token counts. + + Reads a two-column CSV file of tokens and their frequency in a dataset. The + tokens are presumed to be generated by encode() or the equivalent. + + Args: + text_filepattern: A pattern matching one or more files. + max_lines: An integer; maximum total lines to read. + + Returns: + a dictionary mapping token to count. + """ + ret = {} + for line in _read_filepattern(text_filepattern, max_lines=max_lines): + token, count = line.rsplit(",", 1) + ret[_native_to_unicode(token)] = int(count) + + return ret diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index 189f19663..792ef4dbb 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -20,45 +20,132 @@ from __future__ import division from __future__ import print_function +import os import random # Dependency imports -from six import unichr # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin +import six from tensor2tensor.data_generators import tokenizer - import tensorflow as tf +xrange = six.moves.xrange # pylint: disable=redefined-builtin + +FLAGS = tf.app.flags.FLAGS + +_TESTDATA = "google3/third_party/py/tensor2tensor/data_generators/test_data" + class TokenizerTest(tf.test.TestCase): - def testEncode(self): - self.assertEqual( - tokenizer.encode(u"Dude - that's so cool."), - [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]) - self.assertEqual( - tokenizer.encode(u"Łukasz est né en 1981."), - [u"Łukasz", u"est", u"né", u"en", u"1981", u"."]) - self.assertEqual( - tokenizer.encode(u" Spaces at the ends "), - [u" ", u"Spaces", u"at", u"the", u"ends", u" "]) - self.assertEqual(tokenizer.encode(u"802.11b"), [u"802", u".", u"11b"]) - self.assertEqual(tokenizer.encode(u"two. \nlines"), - [u"two", u". \n", u"lines"]) + def test_encode(self): + self.assertListEqual( + [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."], + tokenizer.encode(u"Dude - that's so cool.")) + self.assertListEqual( + [u"Łukasz", u"est", u"né", u"en", u"1981", u"."], + tokenizer.encode(u"Łukasz est né en 1981.")) + self.assertListEqual( + [u" ", u"Spaces", u"at", u"the", u"ends", u" "], + tokenizer.encode(u" Spaces at the ends ")) + self.assertListEqual( + [u"802", u".", u"11b"], + tokenizer.encode(u"802.11b")) + self.assertListEqual( + [u"two", u". \n", u"lines"], + tokenizer.encode(u"two. 
\nlines")) - def testDecode(self): + def test_decode(self): self.assertEqual( + u"Dude - that's so cool.", tokenizer.decode( - [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]), - u"Dude - that's so cool.") + [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])) - def testInvertibilityOnRandomStrings(self): - random.seed(123) + def test_invertibility_on_random_strings(self): for _ in xrange(1000): - s = u"".join([unichr(random.randint(0, 65535)) for _ in xrange(10)]) + s = u"".join( + six.unichr(random.randint(0, 65535)) for _ in xrange(10)) self.assertEqual(s, tokenizer.decode(tokenizer.encode(s))) +class TestTokenCounts(tf.test.TestCase): + + def setUp(self): + super(TestTokenCounts, self).setUp() + self.corpus_path = os.path.join( + FLAGS.test_srcdir, _TESTDATA, "corpus-*.txt") + self.vocab_path = os.path.join( + FLAGS.test_srcdir, _TESTDATA, "vocab-*.txt") + + def test_corpus_token_counts_split_on_newlines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=0, split_on_newlines=True) + + expected = { + u"'": 2, + u".": 2, + u". ": 1, + u"... ": 1, + u"Groucho": 1, + u"Marx": 1, + u"Mitch": 1, + u"Hedberg": 1, + u"I": 3, + u"in": 2, + u"my": 2, + u"pajamas": 2, + } + self.assertDictContainsSubset(expected, token_counts) + self.assertNotIn(u".\n\n", token_counts) + self.assertNotIn(u"\n", token_counts) + + def test_corpus_token_counts_no_split_on_newlines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=0, split_on_newlines=False) + + self.assertDictContainsSubset({u".\n\n": 2, u"\n": 3}, token_counts) + + def test_corpus_token_counts_split_with_max_lines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=5, split_on_newlines=True) + + self.assertIn(u"slept", token_counts) + self.assertNotIn(u"Mitch", token_counts) + + def test_corpus_token_counts_no_split_with_max_lines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=5, split_on_newlines=False) + + self.assertIn(u"slept", token_counts) + self.assertNotIn(u"Mitch", token_counts) + self.assertDictContainsSubset( + {u".\n\n": 1, u"\n": 2, u".\n": 1}, token_counts) + + def test_vocab_token_counts(self): + token_counts = tokenizer.vocab_token_counts( + self.vocab_path, 0) + + expected = { + "lollipop": 8, + "reverberated": 12, + "kattywampus": 11, + "balderdash": 10, + "jiggery-pokery": 14, + } + self.assertDictEqual(expected, token_counts) + + def test_vocab_token_counts_with_max_lines(self): + token_counts = tokenizer.vocab_token_counts( + self.vocab_path, 4) + + expected = { + "lollipop": 8, + "reverberated": 12, + "kattywampus": 11, + "balderdash": 10, + } + self.assertDictEqual(expected, token_counts) + + if __name__ == "__main__": tf.test.main() From c01617efd2a2f321633ffaeaebc8697d46ed0dc0 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 26 Jul 2017 15:01:19 -0700 Subject: [PATCH 17/21] Use TensorFlow idiom for importing six.moves.xrange. 
PiperOrigin-RevId: 163261434 --- tensor2tensor/data_generators/text_encoder.py | 3 +-- tensor2tensor/data_generators/tokenizer.py | 3 +-- tensor2tensor/data_generators/tokenizer_test.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 69d29779a..4bb1c875d 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -30,12 +30,11 @@ # Dependency imports import six +from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf -xrange = six.moves.xrange # pylint: disable=redefined-builtin - # Reserved tokens for things like padding and EOS symbols. PAD = "" EOS = "" diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 1acffc04c..5cb9fd32b 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -51,10 +51,9 @@ # Dependency imports import six +from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -xrange = six.moves.xrange # pylint: disable=redefined-builtin - # Conversion between Unicode and UTF-8, if required (on Python2) _native_to_unicode = (lambda s: s.decode("utf-8")) if six.PY2 else (lambda s: s) diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index 792ef4dbb..ad4a3ff04 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -26,11 +26,10 @@ # Dependency imports import six +from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf -xrange = six.moves.xrange # pylint: disable=redefined-builtin - FLAGS = tf.app.flags.FLAGS _TESTDATA = "google3/third_party/py/tensor2tensor/data_generators/test_data" From 5242ac6e59cf553820d31485509fc527339ada92 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 26 Jul 2017 17:33:54 -0700 Subject: [PATCH 18/21] Rm num_shards from Problem. Problems specify sharding themselves. PiperOrigin-RevId: 163281576 --- README.md | 1 - tensor2tensor/bin/t2t-datagen | 11 +++++++---- tensor2tensor/data_generators/algorithmic.py | 11 ++++------- tensor2tensor/data_generators/genetics.py | 14 ++++++++------ tensor2tensor/data_generators/image.py | 2 +- tensor2tensor/data_generators/problem.py | 2 +- tensor2tensor/data_generators/wmt.py | 6 ++---- 7 files changed, 23 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index c0e34e0fe..edd6460d0 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,6 @@ mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR t2t-datagen \ --data_dir=$DATA_DIR \ --tmp_dir=$TMP_DIR \ - --num_shards=100 \ --problem=$PROBLEM # Train diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 629014713..e4acb6731 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -63,7 +63,8 @@ flags.DEFINE_string("problem", "", "The name of the problem to generate data for.") flags.DEFINE_string("exclude_problems", "", "Comma-separates list of problems to exclude.") -flags.DEFINE_integer("num_shards", 10, "How many shards to use.") +flags.DEFINE_integer("num_shards", 0, "How many shards to use. 
Ignored for " + "registered Problems.") flags.DEFINE_integer("max_cases", 0, "Maximum number of cases to generate (unbounded if 0).") flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") @@ -252,7 +253,7 @@ def generate_data_for_problem(problem): if isinstance(dev_gen, int): # The dev set and test sets are generated as extra shards using the # training generator. The integer specifies the number of training - # shards. FLAGS.num_shards is ignored. + # shards. FLAGS.num_shards is ignored. num_training_shards = dev_gen tf.logging.info("Generating data for %s.", problem) all_output_files = generator_utils.combined_data_filenames( @@ -263,10 +264,11 @@ def generate_data_for_problem(problem): else: # usual case - train data and dev data are generated using separate # generators. + num_shards = FLAGS.num_shards or 10 tf.logging.info("Generating training data for %s.", problem) train_output_files = generator_utils.train_data_filenames( problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, - FLAGS.num_shards) + num_shards) generator_utils.generate_files(training_gen(), train_output_files, FLAGS.max_cases) tf.logging.info("Generating development data for %s.", problem) @@ -282,11 +284,12 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): tf.logging.info("Generating training data for %s.", problem_name) + if FLAGS.num_shards: + raise ValueError("--num_shards should not be set for registered Problem.") problem = registry.problem(problem_name) task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), os.path.expanduser(FLAGS.tmp_dir), - num_shards=FLAGS.num_shards, task_id=task_id) diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py index 017bc8470..c115a1ebe 100644 --- a/tensor2tensor/data_generators/algorithmic.py +++ b/tensor2tensor/data_generators/algorithmic.py @@ -66,10 +66,7 @@ def dev_size(self): def num_shards(self): return 10 - def generate_data(self, data_dir, _, num_shards=None, task_id=-1): - if num_shards is None: - num_shards = self.num_shards - + def generate_data(self, data_dir, _, task_id=-1): def generator_eos(generator): """Shift by NUM_RESERVED_IDS and append EOS token.""" for case in generator: @@ -87,7 +84,7 @@ def generator_eos(generator): utils.generate_dataset_and_shuffle( train_generator_eos(), - self.training_filepaths(data_dir, num_shards, shuffled=True), + self.training_filepaths(data_dir, self.num_shards, shuffled=True), dev_generator_eos(), self.dev_filepaths(data_dir, 1, shuffled=True), shuffle=False) @@ -254,7 +251,7 @@ def zipf_distribution(nbr_symbols, alpha): def zipf_random_sample(distr_map, sample_len): - """Helper function: Generate a random Zipf sample of given lenght. + """Helper function: Generate a random Zipf sample of given length. Args: distr_map: list of float, Zipf's distribution over nbr_symbols. @@ -287,7 +284,7 @@ def reverse_generator_nlplike(nbr_symbols, max_length: integer, maximum length of sequences to generate. nbr_cases: the number of cases to generate. scale_std_dev: float, Normal distribution's standard deviation scale factor - used to draw the lenght of sequence. Default = 1% of the max_length. + used to draw the length of sequence. Default = 1% of the max_length. alpha: float, Zipf's Law Distribution parameter. Default = 1.5. Usually for modelling natural text distribution is in the range [1.1-1.6]. 
diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index 88b82cb49..4e8a6d987 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -87,10 +87,11 @@ def feature_encoders(self, data_dir): "targets": text_encoder.TextEncoder() } - def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): - if num_shards is None: - num_shards = 100 + @property + def num_shards(self): + return 100 + def generate_data(self, data_dir, tmp_dir, task_id=-1): try: # Download source data if download_url specified h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, @@ -109,7 +110,7 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): # Collect created shard processes to start and join processes = [] - datasets = [(self.training_filepaths, num_shards, "train", + datasets = [(self.training_filepaths, self.num_shards, "train", num_train_examples), (self.dev_filepaths, 1, "valid", num_dev_examples), (self.test_filepaths, 1, "test", num_test_examples)] @@ -124,9 +125,10 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): start_idx, end_idx)) processes.append(p) - # Start and wait for processes in batches - assert len(processes) == num_shards + 2 # 1 per training shard + dev + test + # 1 per training shard + dev + test + assert len(processes) == self.num_shards + 2 + # Start and wait for processes in batches num_batches = int( math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES)) for i in xrange(num_batches): diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index acb1128ed..fdad8d432 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -338,7 +338,7 @@ def example_reading_spec(self, label_key=None): class ImageFSNS(ImageProblem): """Problem spec for French Street Name recognition.""" - def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): + def generate_data(self, data_dir, tmp_dir, task_id=-1): list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" "street/python/fsns_urls.txt") fsns_urls = generator_utils.maybe_download( diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 69d81e58e..67e3c6f90 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -135,7 +135,7 @@ class Problem(object): # BEGIN SUBCLASS INTERFACE # ============================================================================ - def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): + def generate_data(self, data_dir, tmp_dir, task_id=-1): raise NotImplementedError() def hparams(self, defaults, model_hparams): diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 9587d4d2a..97b191096 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -83,12 +83,10 @@ def vocab_name(self): def vocab_file(self): return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) - def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): - if num_shards is None: - num_shards = self.num_shards + def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_dataset_and_shuffle( self.train_generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, num_shards, shuffled=False), + self.training_filepaths(data_dir, self.num_shards, 
shuffled=False), self.dev_generator(data_dir, tmp_dir), self.dev_filepaths(data_dir, 1, shuffled=False)) From 93b325f420d85d934d6280b316a248eca982c192 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 26 Jul 2017 18:18:04 -0700 Subject: [PATCH 19/21] Baseline model for GeneExpression problem PiperOrigin-RevId: 163286026 --- tensor2tensor/data_generators/all_problems.py | 2 +- .../{genetics.py => gene_expression.py} | 39 +++--- ...netics_test.py => gene_expression_test.py} | 8 +- tensor2tensor/models/common_layers.py | 31 ++-- tensor2tensor/models/gene_expression.py | 132 ++++++++++++++++++ tensor2tensor/models/gene_expression_test.py | 79 +++++++++++ tensor2tensor/models/modalities.py | 15 +- tensor2tensor/models/models.py | 1 + tensor2tensor/utils/metrics.py | 2 +- tensor2tensor/utils/trainer_utils.py | 12 +- 10 files changed, 279 insertions(+), 42 deletions(-) rename tensor2tensor/data_generators/{genetics.py => gene_expression.py} (90%) rename tensor2tensor/data_generators/{genetics_test.py => gene_expression_test.py} (89%) create mode 100644 tensor2tensor/models/gene_expression.py create mode 100644 tensor2tensor/models/gene_expression_test.py diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index d8007f5e3..6830cf0bf 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -34,7 +34,7 @@ # pylint: disable=g-import-not-at-top try: # Requires h5py - from tensor2tensor.data_generators import genetics + from tensor2tensor.data_generators import gene_expression except ImportError: pass # pylint: enable=g-import-not-at-top diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/gene_expression.py similarity index 90% rename from tensor2tensor/data_generators/genetics.py rename to tensor2tensor/data_generators/gene_expression.py index 4e8a6d987..31d1cd150 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Genetics problems. +"""Gene expression problems. Inputs are bases ACTG (with indices assigned in that order). @@ -82,7 +82,7 @@ def chunk_size(self): def feature_encoders(self, data_dir): del data_dir return { - "inputs": GeneticBaseEncoder(chunk_size=self.chunk_size), + "inputs": DNAEncoder(chunk_size=self.chunk_size), # TODO(rsepassi): RealEncoder? "targets": text_encoder.TextEncoder() } @@ -166,8 +166,15 @@ def example_reading_spec(self): def preprocess_examples(self, examples, mode): del mode + # Reshape targets examples["targets"] = tf.reshape(examples["targets"], [-1, 1, self.num_output_predictions]) + examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1, 1]) + + # Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them. + # Add epsilon because some unmasked labels are actually 0. 
+ examples["targets"] += 1e-6 + examples["targets"] *= examples["targets_mask"] return examples @@ -175,8 +182,8 @@ def eval_metrics(self): return [metrics.Metrics.RMSE] -@registry.register_problem("genetics_cage10") -class GeneticsCAGE10(GeneExpressionProblem): +@registry.register_problem("gene_expression_cage10") +class GeneExpressionCAGE10(GeneExpressionProblem): @property def download_url(self): @@ -187,8 +194,8 @@ def h5_file(self): return "cage10.h5" -@registry.register_problem("genetics_gm12878") -class GeneticsGM12878(GeneExpressionProblem): +@registry.register_problem("gene_expression_gm12878") +class GeneExpressionGM12878(GeneExpressionProblem): @property def download_url(self): @@ -199,8 +206,8 @@ def h5_file(self): return "gm12878.h5" -@registry.register_problem("genetics_l262k") -class GeneticsL262k(GeneExpressionProblem): +@registry.register_problem("gene_expression_l262k") +class GeneExpressionL262k(GeneExpressionProblem): @property def h5_file(self): @@ -236,7 +243,7 @@ def dataset_generator(filepath, chunk_size=1, start_idx=None, end_idx=None): - encoder = GeneticBaseEncoder(chunk_size=chunk_size) + encoder = DNAEncoder(chunk_size=chunk_size) with h5py.File(filepath, "r") as h5_file: # Get input keys from h5_file src_keys = [s % dataset for s in ["%s_in", "%s_na", "%s_out"]] @@ -291,7 +298,7 @@ def to_example_dict(encoder, inputs, mask, outputs): return ex_dict -class GeneticBaseEncoder(text_encoder.TextEncoder): +class DNAEncoder(text_encoder.TextEncoder): """ACTG strings to ints and back. Optionally chunks bases into single ids. Uses 'X' as an unknown base. @@ -302,14 +309,14 @@ class GeneticBaseEncoder(text_encoder.TextEncoder): def __init__(self, chunk_size=1, num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS): - super(GeneticBaseEncoder, self).__init__(num_reserved_ids=num_reserved_ids) + super(DNAEncoder, self).__init__(num_reserved_ids=num_reserved_ids) # Build a vocabulary of chunks of size chunk_size self._chunk_size = chunk_size chunks = [] for size in range(1, chunk_size + 1): - c = itertools.product(_bases + [GeneticBaseEncoder.UNK], repeat=size) + c = itertools.product(_bases + [DNAEncoder.UNK], repeat=size) num_pad = chunk_size - size - padding = (GeneticBaseEncoder.PAD,) * num_pad + padding = (DNAEncoder.PAD,) * num_pad c = [el + padding for el in c] chunks.extend(c) chunks.sort() @@ -323,7 +330,7 @@ def vocab_size(self): def encode(self, s): bases = list(s) - pad = [GeneticBaseEncoder.PAD] * (len(bases) % self._chunk_size) + pad = [DNAEncoder.PAD] * (len(bases) % self._chunk_size) bases.extend(pad) assert (len(bases) % self._chunk_size) == 0 num_chunks = len(bases) // self._chunk_size @@ -342,8 +349,8 @@ def decode(self, ids): for idx in ids: if idx >= self._num_reserved_ids: chunk = self._ids_to_chunk[idx] - if GeneticBaseEncoder.PAD in chunk: - chunk = chunk[:chunk.index(GeneticBaseEncoder.PAD)] + if DNAEncoder.PAD in chunk: + chunk = chunk[:chunk.index(DNAEncoder.PAD)] else: chunk = [text_encoder.RESERVED_TOKENS[idx]] bases.extend(chunk) diff --git a/tensor2tensor/data_generators/genetics_test.py b/tensor2tensor/data_generators/gene_expression_test.py similarity index 89% rename from tensor2tensor/data_generators/genetics_test.py rename to tensor2tensor/data_generators/gene_expression_test.py index 5eac1b249..2d7bbe832 100644 --- a/tensor2tensor/data_generators/genetics_test.py +++ b/tensor2tensor/data_generators/gene_expression_test.py @@ -22,7 +22,7 @@ import numpy as np -from tensor2tensor.data_generators import genetics +from 
tensor2tensor.data_generators import gene_expression import tensorflow as tf @@ -40,7 +40,7 @@ def _oneHotBases(self, bases): return np.array(one_hots) def testRecordToExample(self): - encoder = genetics.GeneticBaseEncoder(chunk_size=2) + encoder = gene_expression.DNAEncoder(chunk_size=2) raw_inputs = ["A", "C", "G", "X", "C", "T"] # Put in numpy arrays in the same format as in the h5 file @@ -48,7 +48,7 @@ def testRecordToExample(self): mask = np.array([True, False, True]) outputs = np.array([[1.0, 2.0, 3.0], [5.0, 1.0, 0.2], [5.1, 2.3, 2.3]]) # Convert to example dict - ex_dict = genetics.to_example_dict(encoder, inputs, mask, outputs) + ex_dict = gene_expression.to_example_dict(encoder, inputs, mask, outputs) self.assertEqual(len(raw_inputs) // 2 + 1, len(ex_dict["inputs"])) self.assertAllEqual(encoder.encode(raw_inputs) + [1], ex_dict["inputs"]) @@ -61,7 +61,7 @@ def testGenerateShardArgs(self): num_examples = 37 num_shards = 4 outfiles = [str(i) for i in range(num_shards)] - shard_args = genetics.generate_shard_args(outfiles, num_examples) + shard_args = gene_expression.generate_shard_args(outfiles, num_examples) starts, ends, fnames = zip(*shard_args) self.assertAllEqual([0, 9, 18, 27], starts) diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index 37e791bc3..e98531d88 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -469,7 +469,10 @@ def get_norm(norm_type): "'noam', 'none'.") -def residual_fn(x, y, norm_type, residual_dropout, +def residual_fn(x, + y, + norm_type, + residual_dropout, filters=None, epsilon=1e-16, name="residual"): @@ -559,11 +562,17 @@ def conv_block_internal(conv_fn, def conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): - """A block of standard convolutions.""" + """A block of standard 2d convolutions.""" return conv_block_internal(conv, inputs, filters, dilation_rates_and_kernel_sizes, **kwargs) +def conv1d_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): + """A block of standard 1d convolutions.""" + return conv_block_internal(conv1d, inputs, filters, + dilation_rates_and_kernel_sizes, **kwargs) + + def separable_conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): """A block of separable convolutions.""" @@ -858,10 +867,7 @@ def multiscale_conv_sum(inputs, output_size, dilation_rates_and_kernel_sizes, return tf.add_n(results) * (len(results)**-0.5) -def multiscale_conv_and_attention(x, - padding, - hparams, - source=None): +def multiscale_conv_and_attention(x, padding, hparams, source=None): """A common part of t2t layers. First, do a linear multiscale convolution @@ -925,10 +931,7 @@ def conv_with_pools(inputs, output_size, kernel_size, pool_sizes, pooling_type, return tf.add_n(results) * (len(results)**-0.5) -def conv_with_pools_and_attention(x, - padding, - hparams, - source=None): +def conv_with_pools_and_attention(x, padding, hparams, source=None): """A common part of t2t layers. 
First, do conv_with_pools @@ -1389,8 +1392,8 @@ def padded_cross_entropy(logits, vocab_size = tf.shape(logits)[-1] with tf.name_scope("padded_cross_entropy", [logits, labels]): pad_logits, pad_labels = pad_with_zeros(logits, labels) - xent = smoothing_cross_entropy(pad_logits, pad_labels, - vocab_size, confidence) + xent = smoothing_cross_entropy(pad_logits, pad_labels, vocab_size, + confidence) weights = weights_fn(pad_labels) if not reduce_sum: return xent * weights, weights @@ -1493,8 +1496,8 @@ def linear_set_layer(layer_size, # Unfortunately tf doesn't support broadcasting via concat, but we can # simply add the transformed context to get the same effect. context = tf.expand_dims(context, axis=1) - cont_tfm = conv1d(context, layer_size, 1, - activation=None, name="cont_conv") + cont_tfm = conv1d( + context, layer_size, 1, activation=None, name="cont_conv") outputs += cont_tfm if activation_fn is not None: diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py new file mode 100644 index 000000000..bdb93509b --- /dev/null +++ b/tensor2tensor/models/gene_expression.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Models for gene expression from DNA.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class GeneExpressionConv(t2t_model.T2TModel): + """Gene expression conv net. + + Based on "Basenji" model from + http://www.biorxiv.org/content/early/2017/07/10/161851 + + Uses layer_norm instead of batch_norm. 
+ """ + + def model_fn_body(self, features): + inputs = features["inputs"] + inputs.get_shape().assert_has_rank(4) + + hp = self._hparams + + out = inputs + out = common_layers.flatten4d3d(out) + + # Conv layers + for i in xrange(hp.num_conv_layers): + out = conv_layer( + out, + hp.hidden_size, + hp.kernel_width, + hp.stride, + hp.pooling_windows[i], + hp.dropout, + 1, + name="conv_%d" % (i + 1)) + + # Dense dilated conv layers + for i in xrange(hp.num_dconv_layers): + dilation_rate = 2**(i + 1) + dconv_out = conv_layer( + out, + hp.hidden_size, + hp.kernel_width, + 1, + 0, + hp.dropout, + dilation_rate, + name="dconv_%d" % (i + 1)) + out = tf.concat([out, dconv_out], axis=2) + + # Fully connected layer + out = fc_layer(out, hp.hidden_size, hp.dropout, name="fc") + + out.get_shape().assert_has_rank(3) + out = tf.expand_dims(out, 2) + return out + + +def conv_layer(x, + hidden_size, + kernel_size, + stride, + pooling_window, + dropout_rate, + dilation_rate, + name="conv"): + with tf.variable_scope(name): + out = x + out = common_layers.conv1d_block( + out, + hidden_size, [(dilation_rate, kernel_size)], + strides=stride, + first_relu=False, + padding="same") + out = tf.nn.relu(out) + if pooling_window: + out = tf.layers.max_pooling1d( + out, pooling_window, pooling_window, padding="same") + out = tf.layers.dropout(out, dropout_rate) + return out + + +def fc_layer(x, num_out, dropout_rate, name="fc"): + with tf.variable_scope(name): + out = x + out = tf.layers.dense(out, num_out) + out = tf.contrib.layers.layer_norm(out) + out = tf.nn.relu(out) + out = tf.layers.dropout(out, dropout_rate) + return out + + +@registry.register_hparams +def gene_expression_conv_base(): + """Hparams for GeneExpressionConv model.""" + hparams = common_hparams.basic_params1() + hparams.add_hparam("num_conv_layers", 4) + hparams.add_hparam("num_dconv_layers", 7) + hparams.add_hparam("pooling_windows", [2, 4, 4, 4]) + + # TODO(rsepassi): Correct the values of these hyperparameters + hparams.hidden_size = 128 + hparams.kernel_width = 128 + hparams.add_hparam("stride", 1) + return hparams diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py new file mode 100644 index 000000000..bec5268fd --- /dev/null +++ b/tensor2tensor/models/gene_expression_test.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Gene Expression models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import gene_expression as gene_data +from tensor2tensor.models import gene_expression +from tensor2tensor.models import modalities # pylint: disable=unused-import + +import tensorflow as tf + + +def gene_expression_conv_test(): + hparams = gene_expression.gene_expression_conv_base() + hparams.hidden_size = 8 + hparams.num_dconv_layers = 2 + return hparams + + +class GeneExpressionModelsTest(tf.test.TestCase): + + def _testModel(self, hparams, model_cls): + batch_size = 3 + target_length = 6 + target_out = 10 # GeneExpressionProblem.num_output_predictions + input_length = target_length * 128 + input_vocab_size = 5 + + inputs = np.random.random_integers( + input_vocab_size, size=(batch_size, input_length, 1, 1)) + targets = np.random.random_sample((batch_size, target_length, 1, + target_out)) + + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.float32), + } + p_hparams, = hparams.problems + sharded_logits, _, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams).model_fn(features) + logits = tf.concat(sharded_logits, 0) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + res = sess.run(logits) + + self.assertEqual(res.shape, (batch_size, target_length, 1, target_out)) + + def testGeneExpressionModels(self): + models_hparams = [(gene_expression.GeneExpressionConv, + gene_expression_conv_test())] + for model_cls, hparams in models_hparams: + hparams.add_hparam("data_dir", None) + p_hparams = gene_data.GeneExpressionCAGE10().internal_hparams(hparams) + hparams.problems = [p_hparams] + self._testModel(hparams, model_cls) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 50a3da55d..20464c0a2 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -166,7 +166,7 @@ def targets_bottom(self, inputs): def top(self, body_output, _): with tf.variable_scope("rgb_softmax"): - # seperate embedding for each channel + # separate embedding for each channel # assuming the body output returns a tensor of shape # [batch_size, rows, cols, channels, self._body_input_depth] body_output_split = tf.split(body_output, self._channels, axis=3) @@ -488,10 +488,15 @@ def top_sharded(self, sharded_targets) def l2_loss(predictions, targets): - return tf.reduce_mean(tf.pow(predictions - targets, 2)) - - loss = data_parallelism(l2_loss, sharded_predictions, sharded_targets) - return sharded_predictions, tf.add_n(loss) + with tf.name_scope("l2"): + weights = weights_fn(targets) + l2 = tf.pow(predictions - targets, 2) + return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) + + loss_num, loss_den = data_parallelism(l2_loss, sharded_predictions, + sharded_targets) + loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) + return sharded_predictions, loss @registry.register_image_modality("identity_no_pad") diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index e92ddd3ed..907a801cf 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -27,6 +27,7 @@ from tensor2tensor.models import attention_lm_moe from tensor2tensor.models import bluenet from tensor2tensor.models import bytenet +from 
tensor2tensor.models import gene_expression from tensor2tensor.models import long_answer from tensor2tensor.models import lstm from tensor2tensor.models import modalities diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 29f44b574..ae9ce3882 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -39,7 +39,7 @@ class Metrics(object): RMSE = "rmse" -def padded_rmse(predictions, labels, weights_fn=common_layers.weights_all): +def padded_rmse(predictions, labels, weights_fn=common_layers.weights_nonzero): predictions, labels = common_layers.pad_with_zeros(predictions, labels) targets = labels weights = weights_fn(targets) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index bf42c36cc..1dbb84d4f 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -45,6 +45,7 @@ import tensorflow as tf from tensorflow.contrib.learn.python.learn import learn_runner +from tensorflow.python import debug from tensorflow.python.ops import init_ops # Number of samples to draw for an image input (in such cases as captioning) @@ -55,6 +56,8 @@ flags.DEFINE_bool("registry_help", False, "If True, logs the contents of the registry and exits.") +flags.DEFINE_bool("tfdbg", False, + "If True, use the TF debugger CLI on train/eval.") flags.DEFINE_string("output_dir", "", "Base output directory for run.") flags.DEFINE_string("model", "", "Which model to use.") flags.DEFINE_string("hparams_set", "", "Which parameters to use.") @@ -168,6 +171,12 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, FLAGS.objective not in eval_metrics): raise ValueError("Tuning objective %s not among evaluation metrics %s" % (FLAGS.objective, eval_metrics.keys())) + train_monitors = [] + eval_hooks = [] + if FLAGS.tfdbg: + hook = debug.LocalCLIDebugHook() + train_monitors.append(hook) + eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, train_input_fn=input_fns["train"], @@ -176,7 +185,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, train_steps=train_steps, eval_steps=eval_steps, min_eval_frequency=FLAGS.local_eval_frequency, - train_monitors=[]) + train_monitors=train_monitors, + eval_hooks=eval_hooks) def create_experiment_components(hparams, output_dir, data_dir, model_name): From 175a125927961a366a023fa4925c15e39561e003 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 26 Jul 2017 19:19:52 -0700 Subject: [PATCH 20/21] v1.1.2 PiperOrigin-RevId: 163290663 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9da5293b9..66d51d7e1 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.1', + version='1.1.2', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From 36766d84aa3da941be1f74efb10fbc4b409500d4 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 11:49:11 -0700 Subject: [PATCH 21/21] internal-external fixes and enable tests PiperOrigin-RevId: 163370562 --- setup.py | 3 + .../data_generators/concatenate_examples.py | 21 +++--- tensor2tensor/data_generators/inspect.py | 24 +++---- .../text_encoder_build_subword.py | 36 +++++----- tensor2tensor/data_generators/tokenizer.py | 2 +- .../data_generators/tokenizer_test.py | 65 +++++++++---------- tensor2tensor/utils/trainer_utils.py | 4 +- 7 files changed, 74 insertions(+), 81 deletions(-) diff --git a/setup.py b/setup.py index 
66d51d7e1..6be9aba04 100644
--- a/setup.py
+++ b/setup.py
@@ -12,6 +12,7 @@
     url='http://github.com/tensorflow/tensor2tensor',
     license='Apache 2.0',
     packages=find_packages(),
+    package_data={'tensor2tensor.data_generators': ['test_data/*']},
     scripts=[
         'tensor2tensor/bin/t2t-trainer',
         'tensor2tensor/bin/t2t-datagen',
@@ -26,6 +27,8 @@
         'tensorflow': ['tensorflow>=1.2.0rc1'],
         'tensorflow_gpu': ['tensorflow-gpu>=1.2.0rc1'],
     },
+    tests_require=['nose'],
+    test_suite='nose.collector',
     classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py
index 60ac7ea8f..9d7678fc4 100644
--- a/tensor2tensor/data_generators/concatenate_examples.py
+++ b/tensor2tensor/data_generators/concatenate_examples.py
@@ -34,7 +34,7 @@
       + subtokenizer.encode("target French Je t'aime.") + [1])
 }
 
-We add a dummy feature "inputs"=[0] for compatability with seq-to-seq models.
+We add a dummy feature "inputs"=[0] for compatibility with seq-to-seq models.
 
 If FLAGS.combine_to_length is nonzero, then we combine multiple examples into
 examples of a constant length, possibly with some padding at the end.
@@ -53,34 +53,33 @@
 from tensor2tensor.data_generators import text_encoder
 
 import tensorflow as tf
 
-tf.app.flags.DEFINE_string("vocab_file", "",
-                           "SubwordTextEncoder vocabulary file")
+tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file")
 
-tf.app.flags.DEFINE_boolean(
+tf.flags.DEFINE_boolean(
     "random_reverse", False,
     "If true, write half of the example with source/target reversed")
 
-tf.app.flags.DEFINE_boolean(
+tf.flags.DEFINE_boolean(
     "count_everything", False,
     "If true, assign positive weights to designators, source and target. "
    "If false, assign positive weights only to target.")
 
-tf.app.flags.DEFINE_string("source_domain_string", "English", "")
-tf.app.flags.DEFINE_string("target_domain_string", "French", "")
+tf.flags.DEFINE_string("source_domain_string", "English", "")
+tf.flags.DEFINE_string("target_domain_string", "French", "")
 
-tf.app.flags.DEFINE_integer(
+tf.flags.DEFINE_integer(
     "combine_to_length", 0,
     "If positive, concatenate examples to form examples with target length "
     " equal to this value. Targets are padded with subtoken id=0.")
 
-tf.app.flags.DEFINE_string("in_file", "", "input filename")
+tf.flags.DEFINE_string("in_file", "", "input filename")
 
-tf.app.flags.DEFINE_string(
+tf.flags.DEFINE_string(
     "out_prefix", "/usr/local/google/tmp/concat",
     "The output filename is equal to out_prefix plus "
     "the last 15 characters of in_file. (e.g. "
     "-00001-of-00100)")
 
-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS
 
 
 def _make_example(ids, weights, raw_num_bytes):
diff --git a/tensor2tensor/data_generators/inspect.py b/tensor2tensor/data_generators/inspect.py
index 6ba054d3c..848b74a2d 100644
--- a/tensor2tensor/data_generators/inspect.py
+++ b/tensor2tensor/data_generators/inspect.py
@@ -32,19 +32,16 @@
 import tensorflow as tf
 
-tf.app.flags.DEFINE_string("subword_text_encoder_filename", "",
-                           "SubwordTextEncoder vocabulary file")
-tf.app.flags.DEFINE_string("token_text_encoder_filename", "",
-                           "TokenTextEncoder vocabulary file")
-tf.app.flags.DEFINE_bool("byte_text_encoder", False,
-                         "use a ByteTextEncoder")
-tf.app.flags.DEFINE_string("input_filename", "", "input filename")
-tf.app.flags.DEFINE_bool("print_inputs", False,
-                         "Print decoded inputs to stdout")
-tf.app.flags.DEFINE_bool("print_targets", False,
-                         "Print decoded targets to stdout")
+tf.flags.DEFINE_string("subword_text_encoder_filename", "",
+                       "SubwordTextEncoder vocabulary file")
+tf.flags.DEFINE_string("token_text_encoder_filename", "",
+                       "TokenTextEncoder vocabulary file")
+tf.flags.DEFINE_bool("byte_text_encoder", False, "use a ByteTextEncoder")
+tf.flags.DEFINE_string("input_filename", "", "input filename")
+tf.flags.DEFINE_bool("print_inputs", False, "Print decoded inputs to stdout")
+tf.flags.DEFINE_bool("print_targets", False, "Print decoded targets to stdout")
 
-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS
 
 
 def main(_):
@@ -53,8 +50,7 @@ def main(_):
     encoder = text_encoder.SubwordTextEncoder(
         FLAGS.subword_text_encoder_filename)
   elif FLAGS.token_text_encoder_filename:
-    encoder = text_encoder.TokenTextEncoder(
-        FLAGS.token_text_encoder_filename)
+    encoder = text_encoder.TokenTextEncoder(FLAGS.token_text_encoder_filename)
   elif FLAGS.byte_text_encoder:
     encoder = text_encoder.ByteTextEncoder()
   else:
diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py
index 88dfac116..47e82a176 100644
--- a/tensor2tensor/data_generators/text_encoder_build_subword.py
+++ b/tensor2tensor/data_generators/text_encoder_build_subword.py
@@ -39,19 +39,18 @@
 import tensorflow as tf
 
-tf.app.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
-                           'where to store the SubwordTextEncoder')
-tf.app.flags.DEFINE_string('corpus_filepattern', '',
-                           'Corpus of one or more text files')
-tf.app.flags.DEFINE_string('vocab_filepattern', '',
-                           'One or more vocabulary files '
-                           '(one word per line as "word,count")')
-tf.app.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus')
-tf.app.flags.DEFINE_integer('corpus_max_lines', 10000,
-                            'How many lines of corpus to read')
-tf.app.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations')
-tf.app.flags.DEFINE_bool('split_on_newlines', True, 'Break corpus into lines.')
-FLAGS = tf.app.flags.FLAGS
+tf.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
+                       'where to store the SubwordTextEncoder')
+tf.flags.DEFINE_string('corpus_filepattern', '',
+                       'Corpus of one or more text files')
+tf.flags.DEFINE_string('vocab_filepattern', '', 'One or more vocabulary files '
+                       '(one word per line as "word,count")')
+tf.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus')
+tf.flags.DEFINE_integer('corpus_max_lines', 10000,
+                        'How many lines of corpus to read')
+tf.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations')
+tf.flags.DEFINE_bool('split_on_newlines', True, 'Break corpus into lines.')
+FLAGS = tf.flags.FLAGS
 
 
 def main(unused_argv):
@@ -61,20 +60,21 @@ def main(unused_argv):
   elif FLAGS.corpus_filepattern:
     token_counts = tokenizer.corpus_token_counts(
-        FLAGS.corpus_filepattern, FLAGS.corpus_max_lines,
+        FLAGS.corpus_filepattern,
+        FLAGS.corpus_max_lines,
         split_on_newlines=FLAGS.split_on_newlines)
   elif FLAGS.vocab_filepattern:
-    token_counts = tokenizer.vocab_token_counts(
-        FLAGS.vocab_filepattern, FLAGS.corpus_max_lines)
+    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
+                                                FLAGS.corpus_max_lines)
   else:
     raise ValueError(
         'Must provide one of --corpus_filepattern or --vocab_filepattern')
 
   encoder = text_encoder.SubwordTextEncoder()
-  encoder.build_from_token_counts(
-      token_counts, FLAGS.min_count, FLAGS.num_iterations)
+  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
+                                  FLAGS.num_iterations)
   encoder.store_to_file(FLAGS.output_fn)
diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py
index 5cb9fd32b..0e8daa75f 100644
--- a/tensor2tensor/data_generators/tokenizer.py
+++ b/tensor2tensor/data_generators/tokenizer.py
@@ -121,7 +121,7 @@ def _read_filepattern(filepattern, max_lines=None, split_on_newlines=True):
     The contents of the files as lines, if split_on_newlines is True, or
     the entire contents of each file if False.
   """
-  filenames = tf.gfile.Glob(filepattern)
+  filenames = sorted(tf.gfile.Glob(filepattern))
   lines_read = 0
   for filename in filenames:
     with tf.gfile.Open(filename) as f:
diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py
index ad4a3ff04..0c299bd0b 100644
--- a/tensor2tensor/data_generators/tokenizer_test.py
+++ b/tensor2tensor/data_generators/tokenizer_test.py
@@ -30,9 +30,10 @@
 from tensor2tensor.data_generators import tokenizer
 
 import tensorflow as tf
 
-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS
 
-_TESTDATA = "google3/third_party/py/tensor2tensor/data_generators/test_data"
+pkg_dir, _ = os.path.split(__file__)
+_TESTDATA = os.path.join(pkg_dir, "test_data")
 
 
 class TokenizerTest(tf.test.TestCase):
@@ -41,18 +42,13 @@ def test_encode(self):
     self.assertListEqual(
         [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."],
         tokenizer.encode(u"Dude - that's so cool."))
-    self.assertListEqual(
-        [u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
-        tokenizer.encode(u"Łukasz est né en 1981."))
-    self.assertListEqual(
-        [u" ", u"Spaces", u"at", u"the", u"ends", u" "],
-        tokenizer.encode(u" Spaces at the ends "))
-    self.assertListEqual(
-        [u"802", u".", u"11b"],
-        tokenizer.encode(u"802.11b"))
-    self.assertListEqual(
-        [u"two", u". \n", u"lines"],
-        tokenizer.encode(u"two. \nlines"))
+    self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
+                         tokenizer.encode(u"Łukasz est né en 1981."))
+    self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "],
+                         tokenizer.encode(u" Spaces at the ends "))
+    self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b"))
+    self.assertListEqual([u"two", u". \n", u"lines"],
+                         tokenizer.encode(u"two. \nlines"))
 
   def test_decode(self):
     self.assertEqual(
@@ -62,8 +58,7 @@ def test_decode(self):
 
   def test_invertibility_on_random_strings(self):
     for _ in xrange(1000):
-      s = u"".join(
-          six.unichr(random.randint(0, 65535)) for _ in xrange(10))
+      s = u"".join(six.unichr(random.randint(0, 65535)) for _ in xrange(10))
       self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
 
 
@@ -71,10 +66,8 @@ class TestTokenCounts(tf.test.TestCase):
 
   def setUp(self):
     super(TestTokenCounts, self).setUp()
-    self.corpus_path = os.path.join(
-        FLAGS.test_srcdir, _TESTDATA, "corpus-*.txt")
-    self.vocab_path = os.path.join(
-        FLAGS.test_srcdir, _TESTDATA, "vocab-*.txt")
+    self.corpus_path = os.path.join(_TESTDATA, "corpus-*.txt")
+    self.vocab_path = os.path.join(_TESTDATA, "vocab-*.txt")
 
   def test_corpus_token_counts_split_on_newlines(self):
     token_counts = tokenizer.corpus_token_counts(
@@ -117,31 +110,33 @@ def test_corpus_token_counts_no_split_with_max_lines(self):
     self.assertIn(u"slept", token_counts)
     self.assertNotIn(u"Mitch", token_counts)
-    self.assertDictContainsSubset(
-        {u".\n\n": 1, u"\n": 2, u".\n": 1}, token_counts)
+    self.assertDictContainsSubset({
+        u".\n\n": 1,
+        u"\n": 2,
+        u".\n": 1
+    }, token_counts)
 
   def test_vocab_token_counts(self):
-    token_counts = tokenizer.vocab_token_counts(
-        self.vocab_path, 0)
+    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 0)
     expected = {
-        "lollipop": 8,
-        "reverberated": 12,
-        "kattywampus": 11,
-        "balderdash": 10,
-        "jiggery-pokery": 14,
+        u"lollipop": 8,
+        u"reverberated": 12,
+        u"kattywampus": 11,
+        u"balderdash": 10,
+        u"jiggery-pokery": 14,
     }
     self.assertDictEqual(expected, token_counts)
 
   def test_vocab_token_counts_with_max_lines(self):
-    token_counts = tokenizer.vocab_token_counts(
-        self.vocab_path, 4)
+    # vocab-1 has 2 lines, vocab-2 has 3
+    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 4)
     expected = {
-        "lollipop": 8,
-        "reverberated": 12,
-        "kattywampus": 11,
-        "balderdash": 10,
+        u"lollipop": 8,
+        u"reverberated": 12,
+        u"kattywampus": 11,
+        u"balderdash": 10,
     }
     self.assertDictEqual(expected, token_counts)
diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py
index 1dbb84d4f..bf105c5ae 100644
--- a/tensor2tensor/utils/trainer_utils.py
+++ b/tensor2tensor/utils/trainer_utils.py
@@ -167,7 +167,7 @@ def create_experiment(output_dir, data_dir, model_name, train_steps,
       model_name=model_name)
   eval_metrics = metrics.create_evaluation_metrics(
       zip(FLAGS.problems.split("-"), hparams.problem_instances))
-  if ("autotune" in FLAGS and FLAGS.autotune and
+  if (hasattr(FLAGS, "autotune") and FLAGS.autotune and
       FLAGS.objective not in eval_metrics):
     raise ValueError("Tuning objective %s not among evaluation metrics %s" %
                      (FLAGS.objective, eval_metrics.keys()))
@@ -572,7 +572,7 @@ def nth_model(n):
       # Define the train_op for the TRAIN mode.
      opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams)
       tf.logging.info("Computing gradients for global model_fn.")
-      opt_summaries = ["learning_rate", "loss", "global_gradient_norm"]
+      opt_summaries = ["learning_rate", "loss"]
       if hparams.summarize_grads:
         opt_summaries.extend(["gradients", "gradient_norm"])
       train_op = tf.contrib.layers.optimize_loss(