From 13b02cad9ae8fcc34ac0da20ba55734c09fe14eb Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 21 Jul 2017 12:30:48 -0700 Subject: [PATCH 01/21] Refactoring of get_or_generate_vocab* functions. PiperOrigin-RevId: 162771691 --- tensor2tensor/__init__.py | 1 + tensor2tensor/bin/t2t-datagen | 1 + tensor2tensor/bin/t2t-make-tf-configs | 1 + tensor2tensor/bin/t2t-trainer | 1 + tensor2tensor/data_generators/__init__.py | 1 + tensor2tensor/data_generators/algorithmic.py | 1 + .../data_generators/algorithmic_math.py | 1 + .../data_generators/algorithmic_math_test.py | 1 + .../data_generators/algorithmic_test.py | 1 + tensor2tensor/data_generators/all_problems.py | 1 + tensor2tensor/data_generators/audio.py | 1 + tensor2tensor/data_generators/audio_test.py | 1 + .../data_generators/concatenate_examples.py | 1 + .../data_generators/generator_utils.py | 148 +++++---- .../data_generators/generator_utils_test.py | 22 ++ tensor2tensor/data_generators/genetics.py | 1 + .../data_generators/genetics_test.py | 1 + tensor2tensor/data_generators/image.py | 1 + tensor2tensor/data_generators/image_test.py | 1 + tensor2tensor/data_generators/inspect.py | 1 + tensor2tensor/data_generators/lm1b.py | 1 + tensor2tensor/data_generators/problem.py | 5 +- .../data_generators/problem_hparams.py | 4 +- .../data_generators/problem_hparams_test.py | 1 + tensor2tensor/data_generators/ptb.py | 1 + tensor2tensor/data_generators/snli.py | 1 + tensor2tensor/data_generators/text_encoder.py | 1 + .../text_encoder_build_subword.py | 1 + tensor2tensor/data_generators/tokenizer.py | 1 + .../data_generators/tokenizer_test.py | 1 + tensor2tensor/data_generators/wiki.py | 1 + tensor2tensor/data_generators/wmt.py | 305 ++++++++---------- tensor2tensor/data_generators/wmt_test.py | 1 + tensor2tensor/data_generators/wsj_parsing.py | 1 + tensor2tensor/models/__init__.py | 1 + tensor2tensor/models/attention_lm.py | 1 + tensor2tensor/models/attention_lm_moe.py | 1 + tensor2tensor/models/bluenet.py | 1 + tensor2tensor/models/bluenet_test.py | 1 + tensor2tensor/models/bytenet.py | 1 + tensor2tensor/models/bytenet_test.py | 1 + tensor2tensor/models/common_attention.py | 4 +- tensor2tensor/models/common_hparams.py | 1 + tensor2tensor/models/common_layers.py | 1 + tensor2tensor/models/common_layers_test.py | 1 + tensor2tensor/models/long_answer.py | 1 + tensor2tensor/models/lstm.py | 1 + tensor2tensor/models/lstm_test.py | 1 + tensor2tensor/models/modalities.py | 1 + tensor2tensor/models/modalities_test.py | 1 + tensor2tensor/models/models.py | 1 + tensor2tensor/models/multimodel.py | 1 + tensor2tensor/models/multimodel_test.py | 1 + tensor2tensor/models/neural_gpu.py | 1 + tensor2tensor/models/neural_gpu_test.py | 1 + tensor2tensor/models/shake_shake.py | 1 + tensor2tensor/models/slicenet.py | 1 + tensor2tensor/models/slicenet_test.py | 1 + tensor2tensor/models/transformer.py | 1 + .../models/transformer_alternative.py | 1 + tensor2tensor/models/transformer_test.py | 1 + tensor2tensor/models/xception.py | 1 + tensor2tensor/models/xception_test.py | 1 + tensor2tensor/utils/__init__.py | 1 + tensor2tensor/utils/avg_checkpoints.py | 1 + tensor2tensor/utils/beam_search.py | 1 + tensor2tensor/utils/beam_search_test.py | 1 + tensor2tensor/utils/bleu_hook.py | 1 + tensor2tensor/utils/bleu_hook_test.py | 1 + tensor2tensor/utils/data_reader.py | 1 + tensor2tensor/utils/data_reader_test.py | 1 + tensor2tensor/utils/expert_utils.py | 1 + tensor2tensor/utils/get_ende_bleu.sh | 4 +- tensor2tensor/utils/metrics.py | 1 + tensor2tensor/utils/metrics_test.py | 
1 + tensor2tensor/utils/modality.py | 1 + tensor2tensor/utils/registry.py | 1 + tensor2tensor/utils/registry_test.py | 1 + tensor2tensor/utils/t2t_model.py | 1 + tensor2tensor/utils/trainer_utils.py | 17 +- tensor2tensor/utils/trainer_utils_test.py | 1 + tensor2tensor/utils/usr_dir.py | 1 + tensor2tensor/utils/yellowfin.py | 1 + tensor2tensor/utils/yellowfin_test.py | 1 + 84 files changed, 341 insertions(+), 244 deletions(-) diff --git a/tensor2tensor/__init__.py b/tensor2tensor/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/__init__.py +++ b/tensor2tensor/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index af5b47f8c..57e2b17fb 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/bin/t2t-make-tf-configs b/tensor2tensor/bin/t2t-make-tf-configs index 6a4dc8641..0b656aba6 100644 --- a/tensor2tensor/bin/t2t-make-tf-configs +++ b/tensor2tensor/bin/t2t-make-tf-configs @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer index a37767258..13dd7d355 100644 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/bin/t2t-trainer @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/__init__.py b/tensor2tensor/data_generators/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/data_generators/__init__.py +++ b/tensor2tensor/data_generators/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py index 2169e1910..676b4e45f 100644 --- a/tensor2tensor/data_generators/algorithmic.py +++ b/tensor2tensor/data_generators/algorithmic.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index e65b47ff0..e061ceb0b 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic_math_test.py b/tensor2tensor/data_generators/algorithmic_math_test.py index 5f0de29fb..7cd67a83c 100644 --- a/tensor2tensor/data_generators/algorithmic_math_test.py +++ b/tensor2tensor/data_generators/algorithmic_math_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py index fb8ff6719..57faaa80b 100644 --- a/tensor2tensor/data_generators/algorithmic_test.py +++ b/tensor2tensor/data_generators/algorithmic_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index 93a8a06a2..d8007f5e3 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/audio.py b/tensor2tensor/data_generators/audio.py index 4f8c096a5..d0747a88c 100644 --- a/tensor2tensor/data_generators/audio.py +++ b/tensor2tensor/data_generators/audio.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/audio_test.py b/tensor2tensor/data_generators/audio_test.py index 1c19432c3..57e4e1ccc 100644 --- a/tensor2tensor/data_generators/audio_test.py +++ b/tensor2tensor/data_generators/audio_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py index 158bc1b59..60ac7ea8f 100644 --- a/tensor2tensor/data_generators/concatenate_examples.py +++ b/tensor2tensor/data_generators/concatenate_examples.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 5c0c94bce..866a0f3e7 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -247,53 +248,19 @@ def gunzip_file(gz_path, new_path): ] -def get_or_generate_vocab(data_dir, tmp_dir, - vocab_filename, vocab_size, sources=None): - """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS).""" +def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size, + generator_fn): + """Inner implementation for vocab generators.""" vocab_filepath = os.path.join(data_dir, vocab_filename) if tf.gfile.Exists(vocab_filepath): tf.logging.info("Found vocab file: %s", vocab_filepath) vocab = text_encoder.SubwordTextEncoder(vocab_filepath) return vocab - sources = sources or _DATA_FILE_URLS - tf.logging.info("Generating vocab from: %s", str(sources)) token_counts = defaultdict(int) - for source in sources: - url = source[0] - filename = os.path.basename(url) - read_type = "r:gz" if "tgz" in filename else "r" - - compressed_file = maybe_download(tmp_dir, filename, url) - - with tarfile.open(compressed_file, read_type) as corpus_tar: - corpus_tar.extractall(tmp_dir) - - for lang_file in source[1]: - tf.logging.info("Reading file: %s" % lang_file) - filepath = os.path.join(tmp_dir, lang_file) - - # For some datasets a second extraction is necessary. - if ".gz" in lang_file: - new_filepath = os.path.join(tmp_dir, lang_file[:-3]) - if tf.gfile.Exists(new_filepath): - tf.logging.info( - "Subdirectory %s already exists, skipping unpacking" % filepath) - else: - tf.logging.info("Unpacking subdirectory %s" % filepath) - gunzip_file(filepath, new_filepath) - filepath = new_filepath - - # Use Tokenizer to count the word occurrences. - with tf.gfile.GFile(filepath, mode="r") as source_file: - file_byte_budget = 3.5e5 if "en" in filepath else 7e5 - for line in source_file: - if file_byte_budget <= 0: - break - line = line.strip() - file_byte_budget -= len(line) - for tok in tokenizer.encode(text_encoder.native_to_unicode(line)): - token_counts[tok] += 1 + for item in generator_fn(): + for tok in tokenizer.encode(text_encoder.native_to_unicode(item)): + token_counts[tok] += 1 vocab = text_encoder.SubwordTextEncoder.build_to_target_size( vocab_size, token_counts, 1, 1e3) @@ -301,6 +268,55 @@ def get_or_generate_vocab(data_dir, tmp_dir, return vocab +def get_or_generate_vocab(data_dir, + tmp_dir, + vocab_filename, + vocab_size, + sources=None): + """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS).""" + sources = sources or _DATA_FILE_URLS + + def generate(): + tf.logging.info("Generating vocab from: %s", str(sources)) + for source in sources: + url = source[0] + filename = os.path.basename(url) + read_type = "r:gz" if "tgz" in filename else "r" + + compressed_file = maybe_download(tmp_dir, filename, url) + + with tarfile.open(compressed_file, read_type) as corpus_tar: + corpus_tar.extractall(tmp_dir) + + for lang_file in source[1]: + tf.logging.info("Reading file: %s" % lang_file) + filepath = os.path.join(tmp_dir, lang_file) + + # For some datasets a second extraction is necessary. + if ".gz" in lang_file: + new_filepath = os.path.join(tmp_dir, lang_file[:-3]) + if tf.gfile.Exists(new_filepath): + tf.logging.info( + "Subdirectory %s already exists, skipping unpacking" % filepath) + else: + tf.logging.info("Unpacking subdirectory %s" % filepath) + gunzip_file(filepath, new_filepath) + filepath = new_filepath + + # Use Tokenizer to count the word occurrences. 
+ with tf.gfile.GFile(filepath, mode="r") as source_file: + file_byte_budget = 3.5e5 if "en" in filepath else 7e5 + for line in source_file: + if file_byte_budget <= 0: + break + line = line.strip() + file_byte_budget -= len(line) + yield line + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) + + def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, index, vocab_filename, vocab_size): r"""Generate a vocabulary from a tabbed source file. @@ -320,27 +336,37 @@ def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, Returns: The vocabulary. """ - vocab_filepath = os.path.join(data_dir, vocab_filename) - if os.path.exists(vocab_filepath): - vocab = text_encoder.SubwordTextEncoder(vocab_filepath) - return vocab - - # Use Tokenizer to count the word occurrences. - token_counts = defaultdict(int) - filepath = os.path.join(tmp_dir, source_filename) - with tf.gfile.GFile(filepath, mode="r") as source_file: - for line in source_file: - line = line.strip() - if line and "\t" in line: - parts = line.split("\t", maxsplit=1) - part = parts[index].strip() - for tok in tokenizer.encode(text_encoder.native_to_unicode(part)): - token_counts[tok] += 1 - - vocab = text_encoder.SubwordTextEncoder.build_to_target_size( - vocab_size, token_counts, 1, 1e3) - vocab.store_to_file(vocab_filepath) - return vocab + def generate(): + filepath = os.path.join(tmp_dir, source_filename) + tf.logging.info("Generating vocab from %s", filepath) + with tf.gfile.GFile(filepath, mode="r") as source_file: + for line in source_file: + line = line.strip() + if line and "\t" in line: + parts = line.split("\t", maxsplit=1) + part = parts[index].strip() + yield part + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) + + +def get_or_generate_txt_vocab(data_dir, vocab_filename, vocab_size, + filepatterns): + """Generate a vocabulary from txt files with example-per-line.""" + if isinstance(filepatterns, str): + filepatterns = [filepatterns] + + def generate(): + tf.logging.info("Generating vocab from %s", filepatterns) + for filepattern in filepatterns: + for filename in tf.gfile.Glob(filepattern): + with tf.gfile.GFile(filename, mode="r") as source_file: + for line in source_file: + yield line.strip() + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) def read_records(filename): diff --git a/tensor2tensor/data_generators/generator_utils_test.py b/tensor2tensor/data_generators/generator_utils_test.py index c776d120c..fd6e15ca3 100644 --- a/tensor2tensor/data_generators/generator_utils_test.py +++ b/tensor2tensor/data_generators/generator_utils_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -84,6 +85,27 @@ def testGunzipFile(self): os.remove(tmp_file_path + ".txt") os.remove(tmp_file_path) + def testGetOrGenerateTxtVocab(self): + data_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + test_file = os.path.join(self.get_temp_dir(), "test.txt") + with tf.gfile.Open(test_file, "w") as outfile: + outfile.write("a b c\n") + outfile.write("d e f\n") + # Create a vocab over the test file. 
+ vocab1 = generator_utils.get_or_generate_txt_vocab( + data_dir, "test.voc", 20, test_file) + self.assertTrue(tf.gfile.Exists(os.path.join(data_dir, "test.voc"))) + self.assertIsNotNone(vocab1) + + # Append a new line to the test file which would change the vocab if + # the vocab were not being read from file. + with tf.gfile.Open(test_file, "a") as outfile: + outfile.write("g h i\n") + vocab2 = generator_utils.get_or_generate_txt_vocab( + data_dir, "test.voc", 20, test_file) + self.assertTrue(tf.gfile.Exists(os.path.join(data_dir, "test.voc"))) + self.assertIsNotNone(vocab2) + self.assertEqual(vocab1.dump(), vocab2.dump()) if __name__ == "__main__": tf.test.main() diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index 255e0caf9..b4ad36544 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/genetics_test.py b/tensor2tensor/data_generators/genetics_test.py index 70b4fe495..85d70f934 100644 --- a/tensor2tensor/data_generators/genetics_test.py +++ b/tensor2tensor/data_generators/genetics_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index e3567d78f..f8e3191a2 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/image_test.py b/tensor2tensor/data_generators/image_test.py index 6c9984265..59cad4226 100644 --- a/tensor2tensor/data_generators/image_test.py +++ b/tensor2tensor/data_generators/image_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/inspect.py b/tensor2tensor/data_generators/inspect.py index dad0c1c83..124c07017 100644 --- a/tensor2tensor/data_generators/inspect.py +++ b/tensor2tensor/data_generators/inspect.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index 78fb001bc..562435184 100644 --- a/tensor2tensor/data_generators/lm1b.py +++ b/tensor2tensor/data_generators/lm1b.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 5beb0385f..690f14277 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -69,10 +70,6 @@ class SpaceID(object): ICE_PARSE_TOK = 19 # Macedonian tokens MK_TOK = 20 - # Czech tokens - CS_TOK = 21 - # Czech characters - CS_CHR = 22 class Problem(object): diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 51bc0ba62..4343afd27 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -180,9 +181,6 @@ def default_problem_hparams(): # 17: Icelandic characters # 18: Icelandic tokens # 19: Icelandic parse tokens - # 20: Macedonian tokens - # 21: Czech tokens - # 22: Czech characters # Add more above if needed. input_space_id=0, target_space_id=0, diff --git a/tensor2tensor/data_generators/problem_hparams_test.py b/tensor2tensor/data_generators/problem_hparams_test.py index ad1f0192d..df92919ef 100644 --- a/tensor2tensor/data_generators/problem_hparams_test.py +++ b/tensor2tensor/data_generators/problem_hparams_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index 9a7db3a78..f71f0d902 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/snli.py b/tensor2tensor/data_generators/snli.py index 7322c59ff..cd4ff723d 100644 --- a/tensor2tensor/data_generators/snli.py +++ b/tensor2tensor/data_generators/snli.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 8be22ce0b..7c53784f3 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py index 093101c68..a0d5d8937 100644 --- a/tensor2tensor/data_generators/text_encoder_build_subword.py +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 2b1cf572c..d1faaa7b3 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index c279290ed..189f19663 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 8f905aa96..49147962a 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 456970e62..bb31d0c0f 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -42,6 +43,23 @@ EOS = text_encoder.EOS_ID +def _default_token_feature_encoders(data_dir, target_vocab_size): + vocab_filename = os.path.join(data_dir, + "vocab.endefr.%d" % target_vocab_size) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + return { + "inputs": subtokenizer, + "targets": subtokenizer, + } + + +def _default_character_feature_encoders(): + return { + "inputs": text_encoder.ByteTextEncoder(), + "targets": text_encoder.ByteTextEncoder(), + } + + class WMTProblem(problem.Problem): """Base class for WMT problems.""" @@ -53,13 +71,14 @@ def is_character_level(self): def targeted_vocab_size(self): raise NotImplementedError() # Not needed if self.is_character_level. 
- def train_generator(self, data_dir, tmp_dir, is_training): - """Generator of the training data.""" + @property + def train_generator(self): + """Generator; takes data_dir, tmp_dir, is_training, targeted_vocab_size.""" raise NotImplementedError() - def dev_generator(self, data_dir, tmp_dir, is_training): - """Generator of the development data.""" - return self.train_generator(data_dir, tmp_dir, is_training) + @property + def dev_generator(self): + return self.train_generator @property def input_space_id(self): @@ -73,35 +92,28 @@ def target_space_id(self): def num_shards(self): return 100 - @property - def vocab_name(self): - return "vocab.endefr" - - @property - def vocab_file(self): - return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) - def generate_data(self, data_dir, tmp_dir, num_shards=None): if num_shards is None: num_shards = self.num_shards - generator_utils.generate_dataset_and_shuffle( - self.train_generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, num_shards, shuffled=False), - self.dev_generator(data_dir, tmp_dir, False), - self.dev_filepaths(data_dir, 1, shuffled=False)) + if self.is_character_level: + generator_utils.generate_dataset_and_shuffle( + self.train_generator(tmp_dir, True), + self.training_filepaths(data_dir, num_shards, shuffled=False), + self.dev_generator(tmp_dir, False), + self.dev_filepaths(data_dir, 1, shuffled=False)) + else: + generator_utils.generate_dataset_and_shuffle( + self.train_generator(data_dir, tmp_dir, True, + self.targeted_vocab_size), + self.training_filepaths(data_dir, num_shards, shuffled=False), + self.dev_generator(data_dir, tmp_dir, False, + self.targeted_vocab_size), + self.dev_filepaths(data_dir, 1, shuffled=False)) def feature_encoders(self, data_dir): if self.is_character_level: - return { - "inputs": text_encoder.ByteTextEncoder(), - "targets": text_encoder.ByteTextEncoder(), - } - vocab_filename = os.path.join(data_dir, self.vocab_file) - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - return { - "inputs": subtokenizer, - "targets": subtokenizer, - } + return _default_character_feature_encoders() + return _default_token_feature_encoders(data_dir, self.targeted_vocab_size) def hparams(self, defaults, unused_model_hparams): p = defaults @@ -163,8 +175,8 @@ def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): Args: source_path: path to the file with source and target sentences. - source_vocab: a SubwordTextEncoder to encode the source string. - target_vocab: a SubwordTextEncoder to encode the target string. + source_vocab: a SunwordTextEncoder to encode the source string. + target_vocab: a SunwordTextEncoder to encode the target string. eos: integer to append at the end of each sequence (default: None). 
Yields: @@ -325,29 +337,6 @@ def bi_vocabs_token_generator(source_path, ("dev.mk", "dev.en") ]] -# English-Czech datasets -_ENCS_TRAIN_DATASETS = [ - [ - "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long - ("training-parallel-nc-v11/news-commentary-v11.cs-en.en", - "training-parallel-nc-v11/news-commentary-v11.cs-en.cs") - ], - [ - "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", - ("commoncrawl.cs-en.en", "commoncrawl.cs-en.cs") - ], - [ - "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", - ("training/europarl-v7.cs-en.en", "training/europarl-v7.cs-en.cs") - ], -] -_ENCS_TEST_DATASETS = [ - [ - "http://data.statmt.org/wmt16/translation-task/dev.tgz", - ("dev/newstest2013.en", "dev/newstest2013.cs") - ], -] - # Generators. @@ -419,6 +408,16 @@ def _compile_data(tmp_dir, datasets, filename): return filename +def ende_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) + datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) + + @registry.register_problem("wmt_ende_tokens_8k") class WMTEnDeTokens8k(WMTProblem): """Problem spec for WMT En-De translation.""" @@ -427,13 +426,9 @@ class WMTEnDeTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - def train_generator(self, data_dir, tmp_dir, train): - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", symbolizer_vocab, EOS) + @property + def train_generator(self): + return ende_wordpiece_token_generator @property def input_space_id(self): @@ -452,6 +447,15 @@ def targeted_vocab_size(self): return 2**15 # 32768 +def ende_character_generator(tmp_dir, train): + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) + + @registry.register_problem("wmt_ende_characters") class WMTEnDeCharacters(WMTProblem): """Problem spec for WMT En-De translation.""" @@ -460,13 +464,9 @@ class WMTEnDeCharacters(WMTProblem): def is_character_level(self): return True - def train_generator(self, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) + @property + def train_generator(self): + return ende_character_generator @property def input_space_id(self): @@ -477,6 +477,29 @@ def target_space_id(self): return problem.SpaceID.DE_CHR +def zhen_wordpiece_token_bigenerator(data_dir, tmp_dir, train, + source_vocab_size, target_vocab_size): + """Wordpiece generator 
for the WMT'17 zh-en dataset.""" + datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS] + target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS] + source_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, + source_vocab_size, source_datasets) + target_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.en.%d" % target_vocab_size, + target_vocab_size, target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) + return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", + source_vocab, target_vocab, EOS) + + +def zhen_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): + return zhen_wordpiece_token_bigenerator(data_dir, tmp_dir, train, + vocab_size, vocab_size) + + @registry.register_problem("wmt_zhen_tokens_8k") class WMTZhEnTokens8k(WMTProblem): """Problem spec for WMT Zh-En translation.""" @@ -485,22 +508,9 @@ class WMTZhEnTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - def train_generator(self, data_dir, tmp_dir, train): - source_vocab_size = self.targeted_vocab_size - target_vocab_size = self.targeted_vocab_size - datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in datasets] - target_datasets = [[item[0], [item[1][1]]] for item in datasets] - source_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, source_vocab_size, - source_datasets) - target_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.en.%d" % target_vocab_size, target_vocab_size, - target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) - return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", - source_vocab, target_vocab, EOS) + @property + def train_generator(self): + return zhen_wordpiece_token_generator @property def input_space_id(self): @@ -532,6 +542,17 @@ def targeted_vocab_size(self): return 2**15 # 32768 +def enfr_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): + """Instance of token generator for the WMT en->fr task.""" + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) + datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) + + @registry.register_problem("wmt_enfr_tokens_8k") class WMTEnFrTokens8k(WMTProblem): """Problem spec for WMT En-Fr translation.""" @@ -540,13 +561,9 @@ class WMTEnFrTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - def train_generator(self, tmp_dir, train): - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) - datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", symbolizer_vocab, EOS) + @property + def train_generator(self): + return enfr_wordpiece_token_generator 
@property def input_space_id(self): @@ -565,6 +582,16 @@ def targeted_vocab_size(self): return 2**15 # 32768 +def enfr_character_generator(tmp_dir, train): + """Instance of character generator for the WMT en->fr task.""" + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) + + @registry.register_problem("wmt_enfr_characters") class WMTEnFrCharacters(WMTProblem): """Problem spec for WMT En-Fr translation.""" @@ -573,13 +600,9 @@ class WMTEnFrCharacters(WMTProblem): def is_character_level(self): return True - def train_generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) + @property + def train_generator(self): + return enfr_character_generator @property def input_space_id(self): @@ -590,6 +613,20 @@ def target_space_id(self): return problem.SpaceID.FR_CHR +def mken_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): + """Wordpiece generator for the SETimes Mk-En dataset.""" + datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in _MKEN_TRAIN_DATASETS] + target_datasets = [[item[0], [item[1][1]]] for item in _MKEN_TRAIN_DATASETS] + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.mken.%d" % vocab_size, vocab_size, + source_datasets + target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) + + @registry.register_problem("setimes_mken_tokens_32k") class SETimesMkEnTokens32k(WMTProblem): """Problem spec for SETimes Mk-En translation.""" @@ -599,20 +636,8 @@ def targeted_vocab_size(self): return 2**15 # 32768 @property - def vocab_name(self): - return "vocab.mken" - - def train_generator(self, data_dir, tmp_dir, train): - datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in datasets] - target_datasets = [[item[0], [item[1][1]]] for item in datasets] - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, - source_datasets + target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) + def train_generator(self): + return mken_wordpiece_token_generator @property def input_space_id(self): @@ -622,65 +647,7 @@ def input_space_id(self): def target_space_id(self): return problem.SpaceID.EN_TOK -@registry.register_problem("wmt_encs_tokens_32k") -class WMTEnCsTokens32k(problem.Problem): - """Problem spec for WMT English-Czech translation.""" - - @property - def target_vocab_size(self): - return 2**15 # 32768 - - @property - def vocab_name(self): - return "vocab.encs" - - def train_generator(self, data_dir, tmp_dir, train): - datasets = 
_ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in datasets] - target_datasets = [[item[0], [item[1][1]]] for item in datasets] - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, - source_datasets + target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.CS_TOK - - -@registry.register_problem("wmt_encs_characters") -class WMTEnCsCharacters(WMTProblem): - """Problem spec for WMT En-Cs character-based translation.""" - - @property - def is_character_level(self): - return True - - def train_generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_encs_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_CHR - - @property - def target_space_id(self): - return problem.SpaceID.CS_CHR - -# TODO This function is not used anywhere. def parsing_character_generator(tmp_dir, train): character_vocab = text_encoder.ByteTextEncoder() filename = "parsing_%s" % ("train" if train else "dev") diff --git a/tensor2tensor/data_generators/wmt_test.py b/tensor2tensor/data_generators/wmt_test.py index 86b88e5b1..441ceef59 100644 --- a/tensor2tensor/data_generators/wmt_test.py +++ b/tensor2tensor/data_generators/wmt_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/wsj_parsing.py b/tensor2tensor/data_generators/wsj_parsing.py index 200754e16..4b1dbdd80 100644 --- a/tensor2tensor/data_generators/wsj_parsing.py +++ b/tensor2tensor/data_generators/wsj_parsing.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/models/__init__.py +++ b/tensor2tensor/models/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 752de038e..3b874555f 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 2754e8366..4b37050bb 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index 95216f43d..3ac477e4b 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index b3f18249d..d4ce85b1a 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index 301626dc2..28862e594 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index f1e42669e..738b84251 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index c8b4a6068..4f694a7f9 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -65,9 +66,6 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) - signal = tf.reshape(signal, [length, 2, num_timescales]) - signal = tf.transpose(signal, perm=[0, 2, 1]) - signal = tf.reshape(signal, [length, channels]) signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) signal = tf.reshape(signal, [1, length, channels]) return x + signal diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index ff856968b..a86974d1f 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index 638535aa2..11b6396a8 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/common_layers_test.py b/tensor2tensor/models/common_layers_test.py index 3a2fafd8b..8e724587b 100644 --- a/tensor2tensor/models/common_layers_test.py +++ b/tensor2tensor/models/common_layers_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/long_answer.py b/tensor2tensor/models/long_answer.py index 7bb6a4a55..be8024f63 100644 --- a/tensor2tensor/models/long_answer.py +++ b/tensor2tensor/models/long_answer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index c3ae0a01e..ae221bdff 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 4ddaf6b64..1e542a666 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 60df80a1c..9a6115558 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/modalities_test.py b/tensor2tensor/models/modalities_test.py index 118db3847..4254c6b04 100644 --- a/tensor2tensor/models/modalities_test.py +++ b/tensor2tensor/models/modalities_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index 2cf639426..e92ddd3ed 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index bf06dfd65..089889ce6 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index 958fac5d7..03990594b 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index 30d535098..fc9d75639 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 1dddc1056..3d1cc0562 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py index 26d43afb3..7fa40783a 100644 --- a/tensor2tensor/models/shake_shake.py +++ b/tensor2tensor/models/shake_shake.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 2ad4c89d1..69e2338b6 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index 911953445..692799571 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c693d1ca3..23197fcd9 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/transformer_alternative.py b/tensor2tensor/models/transformer_alternative.py index 280dbc713..62413c325 100644 --- a/tensor2tensor/models/transformer_alternative.py +++ b/tensor2tensor/models/transformer_alternative.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 997b5d172..a7f1fc9ae 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index d3c5a2690..61fa61235 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index aa5c1c034..bf434aeac 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/__init__.py b/tensor2tensor/utils/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/utils/__init__.py +++ b/tensor2tensor/utils/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/avg_checkpoints.py b/tensor2tensor/utils/avg_checkpoints.py index a84750310..77acd4353 100644 --- a/tensor2tensor/utils/avg_checkpoints.py +++ b/tensor2tensor/utils/avg_checkpoints.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index 3a511907d..dd8275204 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/beam_search_test.py b/tensor2tensor/utils/beam_search_test.py index e084f1f0e..5223989ea 100644 --- a/tensor2tensor/utils/beam_search_test.py +++ b/tensor2tensor/utils/beam_search_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index 155b10c72..06d62ad1e 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/bleu_hook_test.py b/tensor2tensor/utils/bleu_hook_test.py index 8092ab979..bf08174f8 100644 --- a/tensor2tensor/utils/bleu_hook_test.py +++ b/tensor2tensor/utils/bleu_hook_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index cd8e6c2d3..d7af960ab 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index 18507ed06..f0c318e7b 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index c3becbfb4..e21f2453a 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/get_ende_bleu.sh b/tensor2tensor/utils/get_ende_bleu.sh index 3493af74c..09078414f 100755 --- a/tensor2tensor/utils/get_ende_bleu.sh +++ b/tensor2tensor/utils/get_ende_bleu.sh @@ -5,8 +5,10 @@ tok_gold_targets=newstest2013.tok.de decodes_file=$1 +cut -d' ' -f1 $decodes_file > $decodes_file.target + # Tokenize. -perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file > $decodes_file.tok +perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file.target > $decodes_file.tok # Put compounds in ATAT format (comparable to papers like GNMT, ConvS2S). # See https://nlp.stanford.edu/projects/nmt/ : diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index cf66f6af8..118e33394 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/metrics_test.py b/tensor2tensor/utils/metrics_test.py index de72d797f..0d78e632c 100644 --- a/tensor2tensor/utils/metrics_test.py +++ b/tensor2tensor/utils/metrics_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index 3ac6153b7..a42f35c24 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 5a8823510..0baad2471 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py index 1f4436b0c..3231809ea 100644 --- a/tensor2tensor/utils/registry_test.py +++ b/tensor2tensor/utils/registry_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 2a271afbf..9777568fc 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 878dbe107..96c43a5a0 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -121,6 +122,8 @@ "Whether to return 1 (False) or all (True) beams. The \n " "output file will have the format " "\t..\t") +flags.DEFINE_integer("decode_max_input_size", -1, + "Maximum number of ids in input. Or <= 0 for no max.") def _save_until_eos(hyp): @@ -693,17 +696,22 @@ def log_fn(inputs, outputs): decodes.reverse() # Dumping inputs and outputs to file filename.decodes in # format result\tinput in the same order as original inputs + if FLAGS.decode_to_file: + output_filename = FLAGS.decode_to_file + else: + output_filename = filename if FLAGS.decode_shards > 1: - base_filename = filename + ("%.2d" % FLAGS.worker_id) + base_filename = output_filename + ("%.2d" % FLAGS.worker_id) else: - base_filename = filename + base_filename = output_filename decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + str(FLAGS.decode_alpha) + ".decodes") tf.logging.info("Writing decodes into %s" % decode_filename) outfile = tf.gfile.Open(decode_filename, "w") for index in range(len(sorted_inputs)): - outfile.write("%s\n" % (decodes[sorted_keys[index]])) + outfile.write("%s\t%s\n" % (decodes[sorted_keys[index]], + sorted_inputs[sorted_keys[index]])) def decode_interactively(estimator): @@ -744,6 +752,9 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, for inputs in sorted_inputs[b * FLAGS.decode_batch_size: (b + 1) * FLAGS.decode_batch_size]: input_ids = vocabulary.encode(inputs) + if FLAGS.decode_max_input_size > 0: + # Subtract 1 for the EOS_ID. + input_ids = input_ids[:FLAGS.decode_max_input_size - 1] input_ids.append(text_encoder.EOS_ID) batch_inputs.append(input_ids) if len(input_ids) > batch_length: diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index 3ed86952b..ea88183c9 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/usr_dir.py b/tensor2tensor/utils/usr_dir.py index 0a2d0d15c..d89745b98 100644 --- a/tensor2tensor/utils/usr_dir.py +++ b/tensor2tensor/utils/usr_dir.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/yellowfin.py b/tensor2tensor/utils/yellowfin.py index 6bbe31bf6..aeb14e76e 100644 --- a/tensor2tensor/utils/yellowfin.py +++ b/tensor2tensor/utils/yellowfin.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/yellowfin_test.py b/tensor2tensor/utils/yellowfin_test.py index c4727175b..2130be2b3 100644 --- a/tensor2tensor/utils/yellowfin_test.py +++ b/tensor2tensor/utils/yellowfin_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); From 8f624dbda8d78d0331b5cc7465cc1f39bf259de1 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 21 Jul 2017 15:15:04 -0700 Subject: [PATCH 02/21] Don't repeatedly concatenate strings in a loop. 
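This change switches tokenizer.decode from growing a string with += inside the loop to appending pieces to a list and joining once at the end; repeated concatenation copies the partial string on every iteration and can degrade to quadratic time for long token sequences. A minimal, self-contained sketch of the pattern (illustrative only; the real decode additionally inserts the separating space only between adjacent alphanumeric tokens):

def join_tokens(tokens):
  # Collect the pieces in a list; u"".join copies each piece exactly once.
  pieces = []
  for i, token in enumerate(tokens):
    if i > 0:
      pieces.append(u" ")
    pieces.append(token)
  return u"".join(pieces)

assert join_tokens([u"hello", u"world"]) == u"hello world"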
PiperOrigin-RevId: 162791277 --- tensor2tensor/data_generators/tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index d1faaa7b3..0f4141199 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -101,13 +101,13 @@ def decode(tokens): Returns: a unicode string """ - ret = u"" token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens] + ret = [] for i, token in enumerate(tokens): if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]: - ret += u" " - ret += token - return ret + ret.append(u" ") + ret.append(token) + return "".join(ret) def corpus_token_counts(text_filepattern, corpus_max_lines, From e43ce968f9ce9f06dc5bb83cc0bb57af848fe3ac Mon Sep 17 00:00:00 2001 From: T2T Team Date: Mon, 24 Jul 2017 09:27:21 -0700 Subject: [PATCH 03/21] Set `allow_defun` to False, allowing export to tf.SavedModel PiperOrigin-RevId: 162946551 --- tensor2tensor/models/common_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index 11b6396a8..37e791bc3 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -31,7 +31,7 @@ from tensorflow.python.framework import function # This is a global setting. When turned off, no @function.Defun is used. -allow_defun = True +allow_defun = False def saturating_sigmoid(x): From c422b989ba9963b2900b53aa5d8de8d5505ddc01 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Mon, 24 Jul 2017 09:27:44 -0700 Subject: [PATCH 04/21] Add task_id to Problem for possibly distributed data gen PiperOrigin-RevId: 162946584 --- tensor2tensor/bin/t2t-datagen | 5 ++++- tensor2tensor/data_generators/problem.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 57e2b17fb..ecb5175e6 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -67,6 +67,7 @@ flags.DEFINE_integer("num_shards", 10, "How many shards to use.") flags.DEFINE_integer("max_cases", 0, "Maximum number of cases to generate (unbounded if 0).") flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") +flags.DEFINE_integer("task_id", -1, "For distributed data generation.") flags.DEFINE_string("t2t_usr_dir", "", "Path to a Python module that will be imported. The " "__init__.py file should include the necessary imports. 
" @@ -277,9 +278,11 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): problem = registry.problem(problem_name) + task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), os.path.expanduser(FLAGS.tmp_dir), - FLAGS.num_shards) + FLAGS.num_shards, + task_id=task_id) if __name__ == "__main__": diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 690f14277..99f8e97de 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -115,7 +115,7 @@ class Problem(object): # BEGIN SUBCLASS INTERFACE # ============================================================================ - def generate_data(self, data_dir, tmp_dir, num_shards=None): + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): raise NotImplementedError() def hparams(self, defaults, model_hparams): From 7a3c35dabaedbca620e5b7915903076ae93e03a7 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Mon, 24 Jul 2017 18:03:03 -0700 Subject: [PATCH 05/21] GeneExpression Problem, RealModality, and Problem.preprocessing PiperOrigin-RevId: 163016460 --- tensor2tensor/bin/t2t-datagen | 2 +- tensor2tensor/data_generators/algorithmic.py | 2 +- tensor2tensor/data_generators/genetics.py | 171 +++++++++++++++--- .../data_generators/genetics_test.py | 19 +- tensor2tensor/data_generators/image.py | 32 +++- tensor2tensor/data_generators/problem.py | 27 +++ tensor2tensor/data_generators/wmt.py | 2 +- tensor2tensor/models/common_hparams.py | 2 + tensor2tensor/models/modalities.py | 38 +++- tensor2tensor/models/transformer.py | 4 +- tensor2tensor/utils/data_reader.py | 76 ++++++-- tensor2tensor/utils/modality.py | 2 +- tensor2tensor/utils/trainer_utils.py | 72 +++++--- 13 files changed, 363 insertions(+), 86 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index ecb5175e6..783906d95 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -281,7 +281,7 @@ def generate_data_for_registered_problem(problem_name): task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), os.path.expanduser(FLAGS.tmp_dir), - FLAGS.num_shards, + num_shards=FLAGS.num_shards, task_id=task_id) diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py index 676b4e45f..017bc8470 100644 --- a/tensor2tensor/data_generators/algorithmic.py +++ b/tensor2tensor/data_generators/algorithmic.py @@ -66,7 +66,7 @@ def dev_size(self): def num_shards(self): return 10 - def generate_data(self, data_dir, _, num_shards=None): + def generate_data(self, data_dir, _, num_shards=None, task_id=-1): if num_shards is None: num_shards = self.num_shards diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index b4ad36544..848c2341b 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -35,6 +35,7 @@ from __future__ import division from __future__ import print_function +import itertools import multiprocessing as mp import os @@ -50,19 +51,13 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.utils import registry -_bases = list("ACTG") -BASE_TO_ID = dict(zip(_bases, range(len(_bases)))) -ID_TO_BASE = dict(zip(range(len(_bases)), _bases)) -UNK_ID = len(_bases) - +import tensorflow as tf -# TODO(rsepassi): -# * 
DataEncoder for genetic bases -# * GeneticModality and problem hparams -# * Training preprocessing +_bases = list("ACTG") -class GeneticsProblem(problem.Problem): +class GeneExpressionProblem(problem.Problem): + """Base Problem for gene expression datasets.""" @property def download_url(self): @@ -72,13 +67,35 @@ def download_url(self): def h5_file(self): raise NotImplementedError() - def generate_data(self, data_dir, tmp_dir, num_shards=None): + @property + def num_output_predictions(self): + """Number of float predictions per timestep.""" + return 10 + + @property + def chunk_size(self): + return 4 + + def feature_encoders(self, data_dir): + del data_dir + return { + "inputs": GeneticBaseEncoder(chunk_size=self.chunk_size), + # TODO(rsepassi): RealEncoder? + "targets": text_encoder.TextEncoder() + } + + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): if num_shards is None: num_shards = 100 - # Download source data - h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, - self.download_url) + try: + # Download source data if download_url specified + h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, + self.download_url) + except NotImplementedError: + # Otherwise, look for it locally + h5_filepath = os.path.join(tmp_dir, self.h5_file) + with h5py.File(h5_filepath, "r") as h5_file: num_train_examples = h5_file["train_in"].len() num_dev_examples = h5_file["valid_in"].len() @@ -100,7 +117,8 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None): outfiles, num_examples): p = mp.Process( target=generate_dataset, - args=(h5_filepath, key_prefix, [outfile], start_idx, end_idx)) + args=(h5_filepath, key_prefix, [outfile], self.chunk_size, + start_idx, end_idx)) processes.append(p) # Start and wait for processes @@ -113,9 +131,36 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None): # Shuffle generator_utils.shuffle_dataset(all_filepaths) + def hparams(self, defaults, model_hparams): + p = defaults + vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)} + p.target_modality = ("%s:real" % registry.Modalities.GENERIC, + self.num_output_predictions) + p.input_space_id = problem.SpaceID.DNA + p.target_space_id = problem.SpaceID.REAL + + def example_reading_spec(self): + # TODO(rsepassi): propagate and apply targets_mask to output RealModality. 
+ data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets_mask": tf.VarLenFeature(tf.float32), + "targets": tf.VarLenFeature(tf.float32), + } + data_items_to_decoders = None + return (data_fields, data_items_to_decoders) + + def preprocess_examples(self, examples, mode): + del mode + + examples["targets"] = tf.reshape(examples["targets"], + [-1, 1, self.num_output_predictions]) + + return examples + @registry.register_problem("genetics_cage10") -class GeneticsCAGE10(GeneticsProblem): +class GeneticsCAGE10(GeneExpressionProblem): @property def download_url(self): @@ -127,7 +172,7 @@ def h5_file(self): @registry.register_problem("genetics_gm12878") -class GeneticsGM12878(GeneticsProblem): +class GeneticsGM12878(GeneExpressionProblem): @property def download_url(self): @@ -138,6 +183,14 @@ def h5_file(self): return "gm12878.h5" +@registry.register_problem("genetics_l262k") +class GeneticsL262k(GeneExpressionProblem): + + @property + def h5_file(self): + return "l262k_w128.h5" + + def generate_shard_args(outfiles, num_examples): """Generate start and end indices per outfile.""" num_shards = len(outfiles) @@ -152,16 +205,22 @@ def generate_shard_args(outfiles, num_examples): def generate_dataset(h5_filepath, key_prefix, out_filepaths, + chunk_size=1, start_idx=None, end_idx=None): print("PID: %d, Key: %s, (Start, End): (%s, %s)" % (os.getpid(), key_prefix, start_idx, end_idx)) generator_utils.generate_files( - dataset_generator(h5_filepath, key_prefix, start_idx, end_idx), - out_filepaths) + dataset_generator(h5_filepath, key_prefix, chunk_size, start_idx, + end_idx), out_filepaths) -def dataset_generator(filepath, dataset, start_idx=None, end_idx=None): +def dataset_generator(filepath, + dataset, + chunk_size=1, + start_idx=None, + end_idx=None): + encoder = GeneticBaseEncoder(chunk_size=chunk_size) with h5py.File(filepath, "r") as h5_file: # Get input keys from h5_file src_keys = [s % dataset for s in ["%s_in", "%s_na", "%s_out"]] @@ -178,12 +237,13 @@ def dataset_generator(filepath, dataset, start_idx=None, end_idx=None): if i % 100 == 0: print("Generating example %d for %s" % (i, dataset)) inputs, mask, outputs = inp_data[i], mask_data[i], out_data[i] - yield to_example_dict(inputs, mask, outputs) + yield to_example_dict(encoder, inputs, mask, outputs) -def to_example_dict(inputs, mask, outputs): +def to_example_dict(encoder, inputs, mask, outputs): """Convert single h5 record to an example dict.""" # Inputs + bases = [] input_ids = [] last_idx = -1 for row in np.argwhere(inputs): @@ -192,11 +252,13 @@ def to_example_dict(inputs, mask, outputs): assert idx > last_idx # if not, means 2 True values in 1 row # Some rows are all False. Those rows are mapped to UNK_ID. while idx != last_idx + 1: - input_ids.append(UNK_ID + text_encoder.NUM_RESERVED_TOKENS) + bases.append(encoder.UNK) last_idx += 1 - input_ids.append(base_id + text_encoder.NUM_RESERVED_TOKENS) + bases.append(_bases[base_id]) last_idx = idx - assert len(inputs) == len(input_ids) + assert len(inputs) == len(bases) + + input_ids = encoder.encode(bases) input_ids.append(text_encoder.EOS_ID) # Targets: mask and output @@ -211,3 +273,62 @@ def to_example_dict(inputs, mask, outputs): ex_dict = dict( zip(example_keys, [input_ids, targets_mask, targets, targets_shape])) return ex_dict + + +class GeneticBaseEncoder(text_encoder.TextEncoder): + """ACTG strings to ints and back. Optionally chunks bases into single ids. + + Uses 'X' as an unknown base. 
+ """ + UNK = "X" + PAD = "0" + + def __init__(self, + chunk_size=1, + num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS): + super(GeneticBaseEncoder, self).__init__(num_reserved_ids=num_reserved_ids) + # Build a vocabulary of chunks of size chunk_size + self._chunk_size = chunk_size + chunks = [] + for size in range(1, chunk_size + 1): + c = itertools.product(_bases + [GeneticBaseEncoder.UNK], repeat=size) + num_pad = chunk_size - size + padding = (GeneticBaseEncoder.PAD,) * num_pad + c = [el + padding for el in c] + chunks.extend(c) + chunks.sort() + ids = range(self._num_reserved_ids, len(chunks) + self._num_reserved_ids) + self._ids_to_chunk = dict(zip(ids, chunks)) + self._chunks_to_ids = dict(zip(chunks, ids)) + + @property + def vocab_size(self): + return len(self._ids_to_chunk) + self._num_reserved_ids + + def encode(self, s): + bases = list(s) + pad = [GeneticBaseEncoder.PAD] * (len(bases) % self._chunk_size) + bases.extend(pad) + assert (len(bases) % self._chunk_size) == 0 + num_chunks = len(bases) // self._chunk_size + ids = [] + for chunk_idx in xrange(num_chunks): + start_idx = chunk_idx * self._chunk_size + end_idx = start_idx + self._chunk_size + chunk = tuple(bases[start_idx:end_idx]) + if chunk not in self._chunks_to_ids: + raise ValueError("Unrecognized chunk %s" % chunk) + ids.append(self._chunks_to_ids[chunk]) + return ids + + def decode(self, ids): + bases = [] + for idx in ids: + if idx >= self._num_reserved_ids: + chunk = self._ids_to_chunk[idx] + if GeneticBaseEncoder.PAD in chunk: + chunk = chunk[:chunk.index(GeneticBaseEncoder.PAD)] + else: + chunk = [text_encoder.RESERVED_TOKENS[idx]] + bases.extend(chunk) + return "".join(bases) diff --git a/tensor2tensor/data_generators/genetics_test.py b/tensor2tensor/data_generators/genetics_test.py index 85d70f934..5eac1b249 100644 --- a/tensor2tensor/data_generators/genetics_test.py +++ b/tensor2tensor/data_generators/genetics_test.py @@ -30,21 +30,28 @@ class GeneticsTest(tf.test.TestCase): def _oneHotBases(self, bases): + ref = ["A", "C", "T", "G"] one_hots = [] - for base_id in bases: + for base in bases: one_hot = [False] * 4 - if base_id < 4: - one_hot[base_id] = True + if base in ref: + one_hot[ref.index(base)] = True one_hots.append(one_hot) return np.array(one_hots) def testRecordToExample(self): - inputs = self._oneHotBases([0, 1, 3, 4, 1, 0]) + encoder = genetics.GeneticBaseEncoder(chunk_size=2) + raw_inputs = ["A", "C", "G", "X", "C", "T"] + + # Put in numpy arrays in the same format as in the h5 file + inputs = self._oneHotBases(raw_inputs) mask = np.array([True, False, True]) outputs = np.array([[1.0, 2.0, 3.0], [5.0, 1.0, 0.2], [5.1, 2.3, 2.3]]) - ex_dict = genetics.to_example_dict(inputs, mask, outputs) + # Convert to example dict + ex_dict = genetics.to_example_dict(encoder, inputs, mask, outputs) - self.assertAllEqual([2, 3, 5, 6, 3, 2, 1], ex_dict["inputs"]) + self.assertEqual(len(raw_inputs) // 2 + 1, len(ex_dict["inputs"])) + self.assertAllEqual(encoder.encode(raw_inputs) + [1], ex_dict["inputs"]) self.assertAllEqual([1.0, 0.0, 1.0], ex_dict["targets_mask"]) self.assertAllEqual([1.0, 2.0, 3.0, 5.0, 1.0, 0.2, 5.1, 2.3, 2.3], ex_dict["targets"]) diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index f8e3191a2..acb1128ed 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -307,14 +307,38 @@ def mscoco_generator(data_dir, "image/width": [width] } + +class ImageProblem(problem.Problem): + + def 
example_reading_spec(self, label_key=None): + if label_key is None: + label_key = "image/class/label" + + data_fields = { + "image/encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), + label_key: tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = { + "inputs": + tf.contrib.slim.tfexample_decoder.Image( + image_key="image/encoded", + format_key="image/format", + channels=3), + "targets": + tf.contrib.slim.tfexample_decoder.Tensor(label_key), + } + + return data_fields, data_items_to_decoders + # French street names dataset. @registry.register_problem -class ImageFSNS(problem.Problem): +class ImageFSNS(ImageProblem): """Problem spec for French Street Name recognition.""" - def generate_data(self, data_dir, tmp_dir): + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" "street/python/fsns_urls.txt") fsns_urls = generator_utils.maybe_download( @@ -351,6 +375,10 @@ def hparams(self, defaults, model_hparams): p.input_space_id = problem.SpaceID.DIGIT_0 p.target_space_id = problem.SpaceID.DIGIT_1 + def example_reading_spec(self): + label_key = "image/unpadded_label" + return super(ImageFSNS, self).example_reading_spec(self, + label_key=label_key) # Filename for CELEBA data. _CELEBA_NAME = "img_align_celeba" diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 99f8e97de..02e198c03 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -70,6 +70,10 @@ class SpaceID(object): ICE_PARSE_TOK = 19 # Macedonian tokens MK_TOK = 20 + # Genetic bases (ACTG) + DNA = 21 + # Real numbers + REAL = 22 class Problem(object): @@ -131,6 +135,18 @@ def feature_encoders(self, data_dir): "targets": text_encoder.TextEncoder() } + def example_reading_spec(self): + data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = None + return (data_fields, data_items_to_decoders) + + def preprocess_examples(self, examples, mode): + del mode + return examples + # ============================================================================ # END SUBCLASS INTERFACE # ============================================================================ @@ -193,6 +209,17 @@ def internal_hparams(self, model_hparams): _copy_problem_hparams(hp) return hp + def maybe_reverse_features(self, feature_map): + if not self._was_reversed: + return + inputs, targets = feature_map["inputs"], feature_map["targets"] + feature_map["inputs"], feature_map["targets"] = targets, inputs + + def maybe_copy_features(self, feature_map): + if not self._was_copy: + return + feature_map["targets"] = feature_map["inputs"] + def _copy_problem_hparams(p_hparams): """Use input modality, vocab, and space id for target.""" diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index bb31d0c0f..3fc74473a 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -92,7 +92,7 @@ def target_space_id(self): def num_shards(self): return 100 - def generate_data(self, data_dir, tmp_dir, num_shards=None): + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): if num_shards is None: num_shards = self.num_shards if self.is_character_level: diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index a86974d1f..e36b2e4e1 100644 --- 
a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -50,6 +50,8 @@ def basic_params1(): # when not in training mode. dropout=0.2, clip_grad_norm=2.0, + grad_noise_scale=0.0, + summarize_grads=int(False), initializer="orthogonal", initializer_gain=1.5, label_smoothing=0.1, diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 9a6115558..50a3da55d 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -181,12 +181,11 @@ def top(self, body_output, _): shape = tf.shape(body_output_split[i])[:-1] body_output = tf.reshape(body_output_split[i], [-1, self._body_input_depth]) - channel_logits = tf.matmul(body_output, - output_rgb_embedding_var[i], - transpose_b=True) - rgb_channel_logits.append(tf.reshape( - channel_logits, tf.concat([shape, [self.top_dimensionality]], - 0))) + channel_logits = tf.matmul( + body_output, output_rgb_embedding_var[i], transpose_b=True) + rgb_channel_logits.append( + tf.reshape(channel_logits, + tf.concat([shape, [self.top_dimensionality]], 0))) logits = tf.concat(rgb_channel_logits, axis=3) # Reshape logits to conform to CIFAR image shapes (32 by 32 by 3) @@ -468,6 +467,33 @@ def top(self, body_output, _): return body_output +@registry.register_generic_modality("real") +class RealModality(modality.Modality): + """Modality for real (i.e. float) vectors.""" + + def bottom(self, x): + with tf.variable_scope("real"): + return tf.layers.dense(x, self._body_input_depth) + + def top(self, body_output, _): + with tf.variable_scope("real"): + return tf.layers.dense(body_output, self._vocab_size) + + def top_sharded(self, + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=common_layers.weights_nonzero): + sharded_predictions = data_parallelism(self.top, sharded_body_output, + sharded_targets) + + def l2_loss(predictions, targets): + return tf.reduce_mean(tf.pow(predictions - targets, 2)) + + loss = data_parallelism(l2_loss, sharded_predictions, sharded_targets) + return sharded_predictions, tf.add_n(loss) + + @registry.register_image_modality("identity_no_pad") class IdentityModalityNoPad(modality.Modality): """Does nothing except making sure that there is no padding in cross-ent.""" diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 23197fcd9..c45e88577 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -46,8 +46,8 @@ def model_fn_body(self, features): # Remove dropout if not training hparams = copy.copy(self._hparams) targets = features["targets"] - inputs = features.get("inputs") - target_space = features.get("target_space_id") + inputs = features["inputs"] + target_space = features["target_space_id"] inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index d7af960ab..24dd31485 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -138,10 +138,12 @@ def preprocessing(examples, data_file_pattern, mode): # Small single-example pre-processing for images. def resize(img, size): return tf.to_int64(tf.image.resize_images(img, [size, size])) + def preprocess(img): img = tf.image.resize_images(img, [360, 360]) img = common_layers.image_augmentation(tf.to_float(img) / 255.) return tf.to_int64(img * 255.) 
+ if ("image_imagenet" in data_file_pattern or "image_mscoco" in data_file_pattern): examples["inputs"] = tf.cast(examples["inputs"], tf.int64) @@ -154,8 +156,8 @@ def preprocess(img): lambda img=inputs: resize(img, 299)) else: examples["inputs"] = tf.to_int64(resize(inputs, 299)) - elif ("image_cifar10" in data_file_pattern - and mode == tf.contrib.learn.ModeKeys.TRAIN): + elif ("image_cifar10" in data_file_pattern and + mode == tf.contrib.learn.ModeKeys.TRAIN): examples["inputs"] = common_layers.cifar_image_augmentation( examples["inputs"]) elif "img2img" in data_file_pattern: @@ -182,8 +184,62 @@ def preprocess(img): return examples -def input_pipeline(data_file_pattern, capacity, mode): +def problem_input_pipeline(problem, data_file_pattern, capacity, mode): + """Input pipeline for Problems.""" + data_fields, data_items_to_decoders = problem.example_reading_spec() + + # Create placeholders for input, rather than reading data from disk. + if data_file_pattern is None: + return feature_placeholders(data_fields) + + # Now the non-trivial case construction. + examples = examples_queue( + [data_file_pattern], + data_fields, + training=(mode == tf.contrib.learn.ModeKeys.TRAIN), + capacity=capacity, + data_items_to_decoders=data_items_to_decoders) + + examples = problem.preprocess_examples(examples, mode) + + # We do not want int64s as they are not supported on GPUs. + examples = cast_int64_to_int32(examples) + + return examples + + +def cast_int64_to_int32(features): + f = {} + for k, v in six.iteritems(features): + if v.dtype == tf.int64: + v = tf.to_int32(v) + f[k] = v + return f + + +def feature_placeholders(data_fields): + feature_map = {} + for (field, tp) in data_fields: + if not field.startswith("targets"): + feature_map[field] = tf.placeholder( + dtype=tp, shape=[None] * 4, name=field) + return feature_map + + +def input_pipeline(problem, data_file_pattern, capacity, mode): """Input pipeline, returns a dictionary of tensors from queues.""" + + if problem is not None: + # problem is not None when the problem is specified with the Problem API, + # which handles Example decoding and preprocessing. + # Otherwise the problem is specified in problem_hparams and is dealt with + # below. + # As problems are ported to the Problem API, the special handling here will + # need to be moved to Problem.example_reading_spec and + # Problem.preprocessing. + return problem_input_pipeline(problem, data_file_pattern, capacity, mode) + + data_items_to_decoders = None # Read from image TFRecords if the file has "image" in its name. if data_file_pattern and "image" in data_file_pattern: label_key = "image/class/label" @@ -211,22 +267,15 @@ def input_pipeline(data_file_pattern, capacity, mode): "audio/sample_width": tf.FixedLenFeature((), tf.int64), "targets": tf.VarLenFeature(tf.int64), } - data_items_to_decoders = None else: data_fields = { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) } - data_items_to_decoders = None # Create placeholders for input, rather than reading data from disk. if data_file_pattern is None: - feature_map = {} - for (field, tp) in data_fields: - if field != "targets": - feature_map[field] = tf.placeholder( - dtype=tp, shape=[None] * 4, name=field) - return feature_map + return feature_placeholders(data_fields) # Now the non-trivial case construction. 
examples = examples_queue( @@ -238,8 +287,9 @@ def input_pipeline(data_file_pattern, capacity, mode): examples = preprocessing(examples, data_file_pattern, mode) - # We do not want int64s as they do are not supported on GPUs. - return {k: tf.to_int32(v) for (k, v) in six.iteritems(examples)} + # We do not want int64s as they are not supported on GPUs. + examples = cast_int64_to_int32(examples) + return examples def batch_examples(examples, batching_scheme): diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index a42f35c24..72169be1f 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -43,7 +43,7 @@ class Modality(object): function targets_bottom represents the auto-regressive part of the network. It is applied to the already-generated part of an image, which is given to the decoder to generate the next part. In some cases, e.g., for text, it is - the same as the inputs_bottom function, as that is the default we use. But, + the same as the inputs_bottom function, and that is the default we use. But, e.g., for images, a different function might be needed to regress properly. All 3 functions have simple and sharded versions. A sub-class only needs diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 96c43a5a0..c4bdcf942 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -229,13 +229,16 @@ def create_hparams(params_id, data_dir): # Add hparams for the problems hparams.problems = [] + hparams.problem_instances = [] for problem_name in FLAGS.problems.split("-"): try: problem = registry.problem(problem_name) p_hparams = problem.internal_hparams(hparams) except ValueError: + problem = None p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + hparams.problem_instances.append(problem) hparams.problems.append(p_hparams) return hparams @@ -304,9 +307,10 @@ def session_config(): gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction) - config = tf.ConfigProto(allow_soft_placement=True, - graph_options=graph_options, - gpu_options=gpu_options) + config = tf.ConfigProto( + allow_soft_placement=True, + graph_options=graph_options, + gpu_options=gpu_options) return config @@ -422,8 +426,12 @@ def model_fn(features, targets, mode): def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( - hparams, mode, hparams.problems[n], - n, dp, _ps_devices(all_workers=True)) + hparams, + mode, + hparams.problems[n], + n, + dp, + _ps_devices(all_workers=True)) if mode == tf.contrib.learn.ModeKeys.INFER: return model_class.infer( features, @@ -485,8 +493,8 @@ def nth_model(n): if mode == tf.contrib.learn.ModeKeys.EVAL: logits = tf.concat(sharded_logits, 0) if FLAGS.eval_print: - logits = tf.Print(logits, [features["inputs"], logits], - "EVAL PRINT", summarize=10000) + logits = tf.Print( + logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) # For evaluation, return the logits layer as our predictions. run_info["predictions"] = logits train_op = None @@ -544,19 +552,24 @@ def nth_model(n): # Define the train_op for the TRAIN mode. 
opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams) tf.logging.info("Computing gradients for global model_fn.") + opt_summaries = ["learning_rate", "loss", "global_gradient_norm"] + if hparams.summarize_grads: + opt_summaries.extend(["gradients", "gradient_norm"]) train_op = tf.contrib.layers.optimize_loss( name="training", loss=total_loss, global_step=tf.contrib.framework.get_global_step(), learning_rate=learning_rate, clip_gradients=hparams.clip_grad_norm or None, + gradient_noise_scale=hparams.grad_noise_scale or None, optimizer=opt, + summaries=opt_summaries, colocate_gradients_with_ops=True) # Remove summaries that will fail to run because they are in conditionals. # TODO(cwhipkey): Test with this code removed, later in 2017. summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) - for i in range(len(summaries)-1, -1, -1): + for i in range(len(summaries) - 1, -1, -1): if summaries[i].name.startswith("cond_"): del summaries[i] @@ -602,8 +615,7 @@ def decode_from_dataset(estimator): data_file_patterns=infer_problems_data, num_datashards=data_parallelism().n, fixed_problem=i) - result_iter = estimator.predict( - input_fn=infer_input_fn, as_iterable=False) + result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) def log_fn(inputs, targets, @@ -735,8 +747,8 @@ def decode_interactively(estimator): else: tf.logging.info(beam_string) else: - tf.logging.info(targets_vocab.decode(_save_until_eos( - result["outputs"].flatten()))) + tf.logging.info( + targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, @@ -749,8 +761,8 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, tf.logging.info("Decoding batch %d" % b) batch_length = 0 batch_inputs = [] - for inputs in sorted_inputs[b * FLAGS.decode_batch_size: - (b + 1) * FLAGS.decode_batch_size]: + for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( + b + 1) * FLAGS.decode_batch_size]: input_ids = vocabulary.encode(inputs) if FLAGS.decode_max_input_size > 0: # Subtract 1 for the EOS_ID. @@ -1048,12 +1060,13 @@ def input_fn(): for n in xrange(problem_count): if fixed_problem is not None and n != fixed_problem: continue + problem_instance = hparams.problem_instances[n] with tf.name_scope("problem_%d" % n): with tf.device("/cpu:0"): # Input queues are on CPU. capacity = hparams.problems[n].max_expected_batch_size_per_shard capacity *= num_datashards - examples = data_reader.input_pipeline(data_file_patterns[n], - capacity, mode) + examples = data_reader.input_pipeline( + problem_instance, data_file_patterns[n], capacity, mode) if mode == tf.contrib.learn.ModeKeys.TRAIN: drop_long_sequences = True else: @@ -1068,15 +1081,18 @@ def input_fn(): length_multiplier=batch_size_multiplier)) # Reverse inputs and targets features if the problem was reversed. - if hparams.problems[n].was_reversed: - inputs = feature_map["inputs"] - targets = feature_map["targets"] - feature_map["inputs"] = targets - feature_map["targets"] = inputs - - # Use the inputs as the targets if the problem is a copy problem. 
- if hparams.problems[n].was_copy: - feature_map["targets"] = feature_map["inputs"] + if problem_instance is not None: + problem_instance.maybe_reverse_features(feature_map) + problem_instance.maybe_copy_features(feature_map) + else: + if hparams.problems[n].was_reversed: + inputs = feature_map["inputs"] + targets = feature_map["targets"] + feature_map["inputs"] = targets + feature_map["targets"] = inputs + # Use the inputs as the targets if the problem is a copy problem. + if hparams.problems[n].was_copy: + feature_map["targets"] = feature_map["inputs"] # Ensure inputs and targets are proper rank. while len(feature_map["inputs"].get_shape()) != 4: @@ -1117,9 +1133,9 @@ def input_fn(): assert FLAGS.worker_replicas % problem_count == 0 problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) else: - raise ValueError("Value of hparams.problem_choice is %s and must be " - "one of [uniform, adaptive, distributed]" % - hparams.problem_choice) + raise ValueError( + "Value of hparams.problem_choice is %s and must be " + "one of [uniform, adaptive, distributed]" % hparams.problem_choice) # Inputs and targets conditional on problem_choice. rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index( From 315647cdbf6efc78591f3047627ca064c75c31dc Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Mon, 24 Jul 2017 18:51:17 -0700 Subject: [PATCH 06/21] Download newer WMT dev set. PiperOrigin-RevId: 163020223 --- tensor2tensor/data_generators/problem.py | 8 +- .../data_generators/problem_hparams.py | 3 + tensor2tensor/data_generators/wmt.py | 316 ++++++++++-------- tensor2tensor/models/common_attention.py | 3 + tensor2tensor/utils/get_ende_bleu.sh | 4 +- tensor2tensor/utils/trainer_utils.py | 3 +- 6 files changed, 186 insertions(+), 151 deletions(-) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 02e198c03..22b6214e6 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -70,10 +70,14 @@ class SpaceID(object): ICE_PARSE_TOK = 19 # Macedonian tokens MK_TOK = 20 + # Czech tokens + CS_TOK = 21 + # Czech characters + CS_CHR = 22 # Genetic bases (ACTG) - DNA = 21 + DNA = 23 # Real numbers - REAL = 22 + REAL = 24 class Problem(object): diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 4343afd27..159ea6ac9 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -181,6 +181,9 @@ def default_problem_hparams(): # 17: Icelandic characters # 18: Icelandic tokens # 19: Icelandic parse tokens + # 20: Macedonian tokens + # 21: Czech tokens + # 22: Czech characters # Add more above if needed. 
input_space_id=0, target_space_id=0, diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 3fc74473a..50125ccd1 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -43,23 +43,6 @@ EOS = text_encoder.EOS_ID -def _default_token_feature_encoders(data_dir, target_vocab_size): - vocab_filename = os.path.join(data_dir, - "vocab.endefr.%d" % target_vocab_size) - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - return { - "inputs": subtokenizer, - "targets": subtokenizer, - } - - -def _default_character_feature_encoders(): - return { - "inputs": text_encoder.ByteTextEncoder(), - "targets": text_encoder.ByteTextEncoder(), - } - - class WMTProblem(problem.Problem): """Base class for WMT problems.""" @@ -71,14 +54,13 @@ def is_character_level(self): def targeted_vocab_size(self): raise NotImplementedError() # Not needed if self.is_character_level. - @property - def train_generator(self): - """Generator; takes data_dir, tmp_dir, is_training, targeted_vocab_size.""" + def train_generator(self, data_dir, tmp_dir, is_training): + """Generator of the training data.""" raise NotImplementedError() - @property - def dev_generator(self): - return self.train_generator + def dev_generator(self, data_dir, tmp_dir): + """Generator of the development data.""" + return self.train_generator(data_dir, tmp_dir, False) @property def input_space_id(self): @@ -92,28 +74,35 @@ def target_space_id(self): def num_shards(self): return 100 + @property + def vocab_name(self): + return "vocab.endefr" + + @property + def vocab_file(self): + return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) + def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): if num_shards is None: num_shards = self.num_shards - if self.is_character_level: - generator_utils.generate_dataset_and_shuffle( - self.train_generator(tmp_dir, True), - self.training_filepaths(data_dir, num_shards, shuffled=False), - self.dev_generator(tmp_dir, False), - self.dev_filepaths(data_dir, 1, shuffled=False)) - else: - generator_utils.generate_dataset_and_shuffle( - self.train_generator(data_dir, tmp_dir, True, - self.targeted_vocab_size), - self.training_filepaths(data_dir, num_shards, shuffled=False), - self.dev_generator(data_dir, tmp_dir, False, - self.targeted_vocab_size), - self.dev_filepaths(data_dir, 1, shuffled=False)) + generator_utils.generate_dataset_and_shuffle( + self.train_generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, num_shards, shuffled=False), + self.dev_generator(data_dir, tmp_dir), + self.dev_filepaths(data_dir, 1, shuffled=False)) def feature_encoders(self, data_dir): if self.is_character_level: - return _default_character_feature_encoders() - return _default_token_feature_encoders(data_dir, self.targeted_vocab_size) + return { + "inputs": text_encoder.ByteTextEncoder(), + "targets": text_encoder.ByteTextEncoder(), + } + vocab_filename = os.path.join(data_dir, self.vocab_file) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + return { + "inputs": subtokenizer, + "targets": subtokenizer, + } def hparams(self, defaults, unused_model_hparams): p = defaults @@ -175,8 +164,8 @@ def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): Args: source_path: path to the file with source and target sentences. - source_vocab: a SunwordTextEncoder to encode the source string. - target_vocab: a SunwordTextEncoder to encode the target string. 
+ source_vocab: a SubwordTextEncoder to encode the source string. + target_vocab: a SubwordTextEncoder to encode the target string. eos: integer to append at the end of each sequence (default: None). Yields: @@ -262,7 +251,7 @@ def bi_vocabs_token_generator(source_path, _ENDE_TRAIN_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long ("training-parallel-nc-v11/news-commentary-v11.de-en.en", "training-parallel-nc-v11/news-commentary-v11.de-en.de") ], @@ -277,7 +266,7 @@ def bi_vocabs_token_generator(source_path, ] _ENDE_TEST_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/dev.tgz", + "http://data.statmt.org/wmt17/translation-task/dev.tgz", ("dev/newstest2013.en", "dev/newstest2013.de") ], ] @@ -307,7 +296,7 @@ def bi_vocabs_token_generator(source_path, ] _ENFR_TEST_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/dev.tgz", + "http://data.statmt.org/wmt17/translation-task/dev.tgz", ("dev/newstest2013.en", "dev/newstest2013.fr") ], ] @@ -337,6 +326,29 @@ def bi_vocabs_token_generator(source_path, ("dev.mk", "dev.en") ]] +# English-Czech datasets +_ENCS_TRAIN_DATASETS = [ + [ + "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + ("training-parallel-nc-v11/news-commentary-v11.cs-en.en", + "training-parallel-nc-v11/news-commentary-v11.cs-en.cs") + ], + [ + "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", + ("commoncrawl.cs-en.en", "commoncrawl.cs-en.cs") + ], + [ + "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", + ("training/europarl-v7.cs-en.en", "training/europarl-v7.cs-en.cs") + ], +] +_ENCS_TEST_DATASETS = [ + [ + "http://data.statmt.org/wmt17/translation-task/dev.tgz", + ("dev/newstest2013.en", "dev/newstest2013.cs") + ], +] + # Generators. 
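Each *_TRAIN_DATASETS / *_TEST_DATASETS entry above is a pair of [tarball_url, (source_filename, target_filename)]; the refactored train_generator methods in the hunks below split such a list per language before building the vocabularies. A small sketch of that split, using made-up URL and file names:

datasets = [["http://example.com/corpus.tgz", ("corpus.en", "corpus.cs")]]  # hypothetical entry
source_datasets = [[item[0], [item[1][0]]] for item in datasets]
target_datasets = [[item[0], [item[1][1]]] for item in datasets]
assert source_datasets == [["http://example.com/corpus.tgz", ["corpus.en"]]]
assert target_datasets == [["http://example.com/corpus.tgz", ["corpus.cs"]]]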
@@ -408,16 +420,6 @@ def _compile_data(tmp_dir, datasets, filename): return filename -def ende_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @registry.register_problem("wmt_ende_tokens_8k") class WMTEnDeTokens8k(WMTProblem): """Problem spec for WMT En-De translation.""" @@ -426,9 +428,14 @@ class WMTEnDeTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - @property - def train_generator(self): - return ende_wordpiece_token_generator + def train_generator(self, data_dir, tmp_dir, train): + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) + datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -447,15 +454,6 @@ def targeted_vocab_size(self): return 2**15 # 32768 -def ende_character_generator(tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) - - @registry.register_problem("wmt_ende_characters") class WMTEnDeCharacters(WMTProblem): """Problem spec for WMT En-De translation.""" @@ -464,9 +462,13 @@ class WMTEnDeCharacters(WMTProblem): def is_character_level(self): return True - @property - def train_generator(self): - return ende_character_generator + def train_generator(self, tmp_dir, train): + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) @property def input_space_id(self): @@ -477,29 +479,6 @@ def target_space_id(self): return problem.SpaceID.DE_CHR -def zhen_wordpiece_token_bigenerator(data_dir, tmp_dir, train, - source_vocab_size, target_vocab_size): - """Wordpiece generator for the WMT'17 zh-en dataset.""" - datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS] - target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS] - source_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, - source_vocab_size, source_datasets) - target_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.en.%d" % target_vocab_size, - target_vocab_size, target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) - return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", - source_vocab, target_vocab, EOS) - - -def 
zhen_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): - return zhen_wordpiece_token_bigenerator(data_dir, tmp_dir, train, - vocab_size, vocab_size) - - @registry.register_problem("wmt_zhen_tokens_8k") class WMTZhEnTokens8k(WMTProblem): """Problem spec for WMT Zh-En translation.""" @@ -508,9 +487,22 @@ class WMTZhEnTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - @property - def train_generator(self): - return zhen_wordpiece_token_generator + def train_generator(self, data_dir, tmp_dir, train): + source_vocab_size = self.targeted_vocab_size + target_vocab_size = self.targeted_vocab_size + datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in datasets] + target_datasets = [[item[0], [item[1][1]]] for item in datasets] + source_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, source_vocab_size, + source_datasets) + target_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, "vocab.en.%d" % target_vocab_size, target_vocab_size, + target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) + return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", + source_vocab, target_vocab, EOS) @property def input_space_id(self): @@ -542,17 +534,6 @@ def targeted_vocab_size(self): return 2**15 # 32768 -def enfr_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): - """Instance of token generator for the WMT en->fr task.""" - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) - datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @registry.register_problem("wmt_enfr_tokens_8k") class WMTEnFrTokens8k(WMTProblem): """Problem spec for WMT En-Fr translation.""" @@ -561,9 +542,14 @@ class WMTEnFrTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - @property - def train_generator(self): - return enfr_wordpiece_token_generator + def train_generator(self, data_dir, tmp_dir, train): + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) + datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -582,16 +568,6 @@ def targeted_vocab_size(self): return 2**15 # 32768 -def enfr_character_generator(tmp_dir, train): - """Instance of character generator for the WMT en->fr task.""" - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) - return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) - - @registry.register_problem("wmt_enfr_characters") class WMTEnFrCharacters(WMTProblem): """Problem spec for WMT En-Fr translation.""" @@ -600,9 +576,13 @@ class WMTEnFrCharacters(WMTProblem): def is_character_level(self): return True - 
@property - def train_generator(self): - return enfr_character_generator + def train_generator(self, data_dir, tmp_dir, train): + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) @property def input_space_id(self): @@ -613,20 +593,6 @@ def target_space_id(self): return problem.SpaceID.FR_CHR -def mken_wordpiece_token_generator(data_dir, tmp_dir, train, vocab_size): - """Wordpiece generator for the SETimes Mk-En dataset.""" - datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in _MKEN_TRAIN_DATASETS] - target_datasets = [[item[0], [item[1][1]]] for item in _MKEN_TRAIN_DATASETS] - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, "vocab.mken.%d" % vocab_size, vocab_size, - source_datasets + target_datasets) - tag = "train" if train else "dev" - data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @registry.register_problem("setimes_mken_tokens_32k") class SETimesMkEnTokens32k(WMTProblem): """Problem spec for SETimes Mk-En translation.""" @@ -636,8 +602,20 @@ def targeted_vocab_size(self): return 2**15 # 32768 @property - def train_generator(self): - return mken_wordpiece_token_generator + def vocab_name(self): + return "vocab.mken" + + def train_generator(self, data_dir, tmp_dir, train): + datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in datasets] + target_datasets = [[item[0], [item[1][1]]] for item in datasets] + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, + source_datasets + target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -648,12 +626,62 @@ def target_space_id(self): return problem.SpaceID.EN_TOK -def parsing_character_generator(tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - filename = "parsing_%s" % ("train" if train else "dev") - text_filepath = os.path.join(tmp_dir, filename + ".text") - tags_filepath = os.path.join(tmp_dir, filename + ".tags") - return character_generator(text_filepath, tags_filepath, character_vocab, EOS) +@registry.register_problem("wmt_encs_tokens_32k") +class WMTEnCsTokens32k(problem.Problem): + """Problem spec for WMT English-Czech translation.""" + + @property + def target_vocab_size(self): + return 2**15 # 32768 + + @property + def vocab_name(self): + return "vocab.encs" + + def train_generator(self, data_dir, tmp_dir, train): + datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in datasets] + target_datasets = [[item[0], [item[1][1]]] for item in datasets] + symbolizer_vocab = generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, + source_datasets + target_datasets) + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag) + return 
token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) + + @property + def input_space_id(self): + return problem.SpaceID.EN_TOK + + @property + def target_space_id(self): + return problem.SpaceID.CS_TOK + + +@registry.register_problem("wmt_encs_characters") +class WMTEnCsCharacters(WMTProblem): + """Problem spec for WMT En-Cs character-based translation.""" + + @property + def is_character_level(self): + return True + + def train_generator(self, data_dir, tmp_dir, train): + character_vocab = text_encoder.ByteTextEncoder() + datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS + tag = "train" if train else "dev" + data_path = _compile_data(tmp_dir, datasets, "wmt_encs_chr_%s" % tag) + return character_generator(data_path + ".lang1", data_path + ".lang2", + character_vocab, EOS) + + @property + def input_space_id(self): + return problem.SpaceID.EN_CHR + + @property + def target_space_id(self): + return problem.SpaceID.CS_CHR def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 4f694a7f9..624623f4c 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -66,6 +66,9 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) + signal = tf.reshape(signal, [length, 2, num_timescales]) + signal = tf.transpose(signal, perm=[0, 2, 1]) + signal = tf.reshape(signal, [length, channels]) signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) signal = tf.reshape(signal, [1, length, channels]) return x + signal diff --git a/tensor2tensor/utils/get_ende_bleu.sh b/tensor2tensor/utils/get_ende_bleu.sh index 09078414f..3493af74c 100755 --- a/tensor2tensor/utils/get_ende_bleu.sh +++ b/tensor2tensor/utils/get_ende_bleu.sh @@ -5,10 +5,8 @@ tok_gold_targets=newstest2013.tok.de decodes_file=$1 -cut -d' ' -f1 $decodes_file > $decodes_file.target - # Tokenize. -perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file.target > $decodes_file.tok +perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file > $decodes_file.tok # Put compounds in ATAT format (comparable to papers like GNMT, ConvS2S). # See https://nlp.stanford.edu/projects/nmt/ : diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index c4bdcf942..0943881f3 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -722,8 +722,7 @@ def log_fn(inputs, outputs): tf.logging.info("Writing decodes into %s" % decode_filename) outfile = tf.gfile.Open(decode_filename, "w") for index in range(len(sorted_inputs)): - outfile.write("%s\t%s\n" % (decodes[sorted_keys[index]], - sorted_inputs[sorted_keys[index]])) + outfile.write("%s\n" % (decodes[sorted_keys[index]])) def decode_interactively(estimator): From bea499320874dc617631c52632f43ffd782542b7 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 25 Jul 2017 09:05:13 -0700 Subject: [PATCH 07/21] Clean up some code around escaping/unescaping tokens and add tests. 
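For reference, the escaping scheme this change documents and tests maps '_' to '\u', '\' to '\\', and any character outside the vocabulary's alphabet (or a newline) to '\<ord>;', then appends a trailing '_' as an end-of-token sentinel; _unescape_token inverts this with the regex added below. A standalone Python 3 sketch of the round trip (simplified relative to the real text_encoder.py code, which also supports Python 2 via six and validates input types):

import re

_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")

def escape_token(token, alphabet):
  # '\' -> '\\', '_' -> '\u', out-of-alphabet characters -> '\<ord>;', plus a trailing '_'.
  token = token.replace("\\", "\\\\").replace("_", "\\u")
  return "".join(c if c in alphabet and c != "\n" else r"\%d;" % ord(c)
                 for c in token) + "_"

def unescape_token(escaped_token):
  def match(m):
    if m.group(1) is None:
      return "_" if m.group(0) == "\\u" else "\\"
    return chr(int(m.group(1)))
  trimmed = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token
  return _UNESCAPE_REGEX.sub(match, trimmed)

# Toy alphabet that includes the backslash escape character itself.
alphabet = set("abcdefghijklmnopqrstuvwxyz\\_")
assert escape_token("foo_bar!", alphabet) == "foo\\ubar\\33;_"
assert unescape_token(escape_token("foo_bar\\baz!", alphabet)) == "foo_bar\\baz!"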
PiperOrigin-RevId: 163077617 --- tensor2tensor/data_generators/text_encoder.py | 248 +++++++++--------- .../data_generators/text_encoder_test.py | 68 +++++ 2 files changed, 199 insertions(+), 117 deletions(-) create mode 100644 tensor2tensor/data_generators/text_encoder_test.py diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 7c53784f3..afe1da9ae 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -24,15 +24,12 @@ from __future__ import division from __future__ import print_function -from collections import defaultdict +import collections import re # Dependency imports import six -from six import PY2 -from six import unichr # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf @@ -46,7 +43,7 @@ PAD_ID = RESERVED_TOKENS.index(PAD) # Normally 0 EOS_ID = RESERVED_TOKENS.index(EOS) # Normally 1 -if PY2: +if six.PY2: RESERVED_TOKENS_BYTES = RESERVED_TOKENS else: RESERVED_TOKENS_BYTES = [bytes(PAD, "ascii"), bytes(EOS, "ascii")] @@ -56,18 +53,17 @@ # '\u' is converted to '_' # '\\' is converted to '\' # '\213;' is converted to unichr(213) -_UNESCAPE_REGEX = re.compile(u"|".join([r"\\u", r"\\\\", r"\\([0-9]+);"])) +_UNESCAPE_REGEX = re.compile(ur"\\u|\\\\|\\([0-9]+);") +_ESCAPE_CHARS = set(u"\\_;0123456789") def native_to_unicode_py2(s): """Python 2: transform native string to Unicode.""" - if isinstance(s, unicode): - return s - return s.decode("utf-8") + return s if isinstance(s, unicode) else s.decode("utf8") # Conversion between Unicode and UTF-8, if required (on Python2) -if PY2: +if six.PY2: native_to_unicode = native_to_unicode_py2 unicode_to_native = lambda s: s.encode("utf-8") else: @@ -131,7 +127,7 @@ class ByteTextEncoder(TextEncoder): def encode(self, s): numres = self._num_reserved_ids - if PY2: + if six.PY2: return [ord(c) + numres for c in s] # Python3: explicitly convert to UTF-8 return [c + numres for c in s.encode("utf-8")] @@ -145,7 +141,7 @@ def decode(self, ids): decoded_ids.append(RESERVED_TOKENS_BYTES[int(id_)]) else: decoded_ids.append(int2byte(id_ - numres)) - if PY2: + if six.PY2: return "".join(decoded_ids) # Python3: join byte arrays and then decode string return b"".join(decoded_ids).decode("utf-8", "replace") @@ -199,6 +195,55 @@ def _load_vocab_from_file(self, filename): self._id_to_token[idx] = tok +def _escape_token(token, alphabet): + """Escape away underscores and OOV characters and append '_'. + + This allows the token to be experessed as the concatenation of a list + of subtokens from the vocabulary. The underscore acts as a sentinel + which allows us to invertibly concatenate multiple such lists. + + Args: + token: A unicode string to be escaped. + alphabet: A set of all characters in the vocabulary's alphabet. + + Returns: + escaped_token: An escaped unicode string. + + Raises: + ValueError: If the provided token is not unicode. + """ + if not isinstance(token, six.text_type): + raise ValueError("Expected string type for token, got %s" % type(token)) + + token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + ret = [ + c if c in alphabet and c != u"\n" else ur"\%d;" % ord(c) + for c in token] + return u"".join(ret) + "_" + + +def _unescape_token(escaped_token): + """Inverse of _escape_token(). 
+ + Args: + escaped_token: a unicode string + + Returns: + token: a unicode string + """ + def match(m): + if m.group(1) is None: + return u"_" if m.group(0) == u"\\u" else u"\\" + + try: + return six.unichr(int(m.group(1))) + except (ValueError, OverflowError) as _: + return "" + + trimmed = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token + return _UNESCAPE_REGEX.sub(match, trimmed) + + class SubwordTextEncoder(TextEncoder): """Class for invertibly encoding text using a limited vocabulary. @@ -276,7 +321,8 @@ def _tokens_to_subtokens(self, tokens): """ ret = [] for token in tokens: - ret.extend(self._escaped_token_to_subtokens(self._escape_token(token))) + ret.extend(self._escaped_token_to_subtokens( + _escape_token(token, self._alphabet))) return ret def _subtokens_to_tokens(self, subtokens): @@ -290,7 +336,7 @@ def _subtokens_to_tokens(self, subtokens): concatenated = "".join( [self._subtoken_to_subtoken_string(s) for s in subtokens]) split = concatenated.split("_") - return [self._unescape_token(t + "_") for t in split if t] + return [_unescape_token(t + "_") for t in split if t] def _subtoken_to_subtoken_string(self, subtoken): """Subtoken_String (string) corresponding to the given subtoken (id).""" @@ -312,12 +358,17 @@ def _escaped_token_to_subtokens(self, escaped_token): while pos < lesc: end = min(lesc, pos + self._max_subtoken_len) while end > pos: - subtoken = self._subtoken_string_to_id.get(escaped_token[pos:end], -1) - if subtoken != -1: + subtoken_id = self._subtoken_string_to_id.get(escaped_token[pos:end]) + if subtoken_id is not None: break end -= 1 - assert end > pos - ret.append(subtoken) + + # If there is no possible encoding of the escaped token then one of the + # characters in the token is not in the alphabet. This should be + # impossible and would be indicative of a bug. + assert subtoken_id is not None + + ret.append(subtoken_id) pos = end return ret @@ -331,27 +382,37 @@ def build_to_target_size(cls, num_iterations=4): """Builds a SubwordTextEncoder that has `vocab_size` near `target_size`. - Uses simple recursive binary search to find a `min_count` value that most + Uses simple recursive binary search to find a minimum token count that most closely matches the `target_size`. Args: - target_size: desired vocab_size to approximate. - token_counts: a dictionary of string to int. - min_val: an integer - lower bound for `min_count`. - max_val: an integer - upper bound for `min_count`. - num_iterations: an integer. how many iterations of refinement. + target_size: Desired vocab_size to approximate. + token_counts: A dictionary of token counts, mapping string to int. + min_val: An integer; lower bound for the minimum token count. + max_val: An integer; upper bound for the minimum token count. + num_iterations: An integer; how many iterations of refinement. Returns: - a SubwordTextEncoder instance. + A SubwordTextEncoder instance. + + Raises: + ValueError: If `min_val` is greater than `max_val`. 
""" + if min_val > max_val: + raise ValueError( + "Lower bound for the minimum token count " + "is greater than the upper bound.") + def bisect(min_val, max_val): """Bisection to find the right size.""" present_count = (max_val + min_val) // 2 tf.logging.info("Trying min_count %d" % present_count) subtokenizer = cls() - subtokenizer.build_from_token_counts(token_counts, - present_count, num_iterations) - if min_val >= max_val or subtokenizer.vocab_size == target_size: + subtokenizer.build_from_token_counts( + token_counts, present_count, num_iterations) + + # If min_val == max_val, we can't do any better than this. + if subtokenizer.vocab_size == target_size or min_val == max_val: return subtokenizer if subtokenizer.vocab_size > target_size: @@ -382,34 +443,27 @@ def build_from_token_counts(self, num_iterations: an integer. how many iterations of refinement. num_reserved_ids: an integer. how many ids to reserve for special tokens. """ - # first determine the alphabet to include all characters with count at - # least min_count in the dataset. - char_counts = defaultdict(int) - for token, count in six.iteritems(token_counts): - for c in token: - char_counts[c] += count - self._alphabet = set() - for c, count in six.iteritems(char_counts): - if count >= min_count: - self._alphabet.add(c) - # Make sure all characters needed for escaping are included - for c in u"\\_;0123456789": - self._alphabet.add(c) + self._init_alphabet_from_tokens(six.iterkeys(token_counts)) + + # Bootstrap the initial list of subtokens with the characters from the + # alphabet plus the escaping characters. + self._init_subtokens_from_list( + list(self._alphabet), reserved=num_reserved_ids) # We build iteratively. On each iteration, we segment all the words, # then count the resulting potential subtokens, keeping the ones # with high enough counts for our new vocabulary. if min_count < 1: min_count = 1 - for i in xrange(num_iterations): + for i in six.moves.range(num_iterations): tf.logging.info("Iteration {0}".format(i)) - counts = defaultdict(int) + counts = collections.defaultdict(int) for token, count in six.iteritems(token_counts): - escaped_token = self._escape_token(token) + escaped_token = _escape_token(token, self._alphabet) # we will count all tails of the escaped_token, starting from boundaries # determined by our current segmentation. if i == 0: - starts = xrange(len(escaped_token)) + starts = six.moves.range(len(escaped_token)) else: subtokens = self._escaped_token_to_subtokens(escaped_token) pos = 0 @@ -418,48 +472,43 @@ def build_from_token_counts(self, starts.append(pos) pos += len(self._all_subtoken_strings[subtoken]) for start in starts: - for end in xrange(start + 1, len(escaped_token) + 1): + for end in six.moves.range(start + 1, len(escaped_token) + 1): subtoken_string = escaped_token[start:end] counts[subtoken_string] += count - # Make sure all characters needed for escaping are included - for c in self._alphabet: - counts[c] += min_count # Array of sets of candidate subtoken strings, by length len_to_subtoken_strings = [] for subtoken_string, count in six.iteritems(counts): lsub = len(subtoken_string) - if count >= min_count: + # Always include all the alphabet characters or some strings will + # be unencodeable. 
+ if count >= min_count or subtoken_string in self._alphabet: # Add this subtoken string to its length set while len(len_to_subtoken_strings) <= lsub: len_to_subtoken_strings.append(set()) len_to_subtoken_strings[lsub].add(subtoken_string) new_subtoken_strings = [] - # consider the candidates longest to shortest, so that if we accept + # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. - for lsub in reversed(range(1, len(len_to_subtoken_strings))): + for lsub in six.moves.range(len(len_to_subtoken_strings)-1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: count = counts[subtoken_string] - if count >= min_count: - new_subtoken_strings.append((count, subtoken_string)) - for l in xrange(1, lsub): + if count >= min_count or subtoken_string in self._alphabet: + # Exclude alphabet tokens here, as they must be included later + # explicitly, regardless of count. + if subtoken_string not in self._alphabet: + new_subtoken_strings.append((count, subtoken_string)) + for l in six.moves.range(1, lsub): counts[subtoken_string[:l]] -= count - # Sort in decreasing order by count new_subtoken_strings.sort(reverse=True) - # Now we have a candidate vocabulary - old_alphabet = self._alphabet - self._init_from_list([u""] * num_reserved_ids + - [p[1] for p in new_subtoken_strings]) - assert old_alphabet == self._alphabet - tf.logging.info("vocab_size = %d" % self.vocab_size) - original = "This sentence was encoded by the SubwordTextEncoder." - encoded = self.encode(original) - print(encoded) - print([self._subtoken_to_subtoken_string(s) for s in encoded]) - decoded = self.decode(encoded) - print(decoded) - assert decoded == original + # Reinitialize to the candidate vocabulary, including the alphabet + # explicitly as the highest priority. + self._init_subtokens_from_list( + list(self._alphabet) + + [subtoken for _, subtoken in new_subtoken_strings], + reserved=num_reserved_ids) + tf.logging.info("vocab_size = %d" % self.vocab_size) def dump(self): """Debugging dump of the current subtoken vocabulary.""" @@ -468,15 +517,21 @@ def dump(self): print(u", ".join(u"{0} : '{1}'".format(i, s) for i, s in sorted(subtoken_strings))) - def _init_from_list(self, subtoken_strings): - """Initialize from a list of subtoken strings.""" - self._all_subtoken_strings = subtoken_strings + def _init_subtokens_from_list(self, subtoken_strings, reserved=0): + """Initialize token information from a list of subtoken strings.""" + self._all_subtoken_strings = [u""] * reserved + subtoken_strings # we remember the maximum length of any subtoken to avoid having to # check arbitrarily long strings. self._max_subtoken_len = max([len(s) for s in subtoken_strings]) self._subtoken_string_to_id = { - s: i for i, s in enumerate(subtoken_strings) if s} - self._alphabet = set([c for c in subtoken_strings if len(c) == 1]) + s: i+reserved for i, s in enumerate(subtoken_strings) if s} + + def _init_alphabet_from_tokens(self, tokens): + """Initialize alphabet from an iterable of token or subtoken strings.""" + # Include all characters from all tokens in the alphabet to guarantee that + # any token can be encoded. Additionally, include all escaping characters. 
+ self._alphabet = {c for token in tokens for c in token} + self._alphabet |= _ESCAPE_CHARS def _load_from_file(self, filename): """Load from a file.""" @@ -484,51 +539,10 @@ def _load_from_file(self, filename): with tf.gfile.Open(filename) as f: for line in f: subtoken_strings.append(native_to_unicode(line.strip()[1:-1])) - self._init_from_list(subtoken_strings) + self._init_subtokens_from_list(subtoken_strings) + self._init_alphabet_from_tokens(subtoken_strings) def store_to_file(self, filename): with tf.gfile.Open(filename, "w") as f: for subtoken_string in self._all_subtoken_strings: f.write("'" + unicode_to_native(subtoken_string) + "'\n") - - def _escape_token(self, token): - """Escape away underscores and OOV characters and append '_'. - - This allows the token to be experessed as the concatenation of a list - of subtokens from the vocabulary. The underscore acts as a sentinel - which allows us to invertibly concatenate multiple such lists. - - Args: - token: a unicode string - Returns: - escaped_token: a unicode string - """ - assert isinstance(token, six.text_type) - token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + u"_" - ret = u"" - for c in token: - if c in self._alphabet and c != u"\n": - ret += c - else: - ret += u"\\%d;" % ord(c) - return ret - - def _unescape_token(self, escaped_token): - """Inverse of _escape_token(). - - Args: - escaped_token: a unicode string - Returns: - token: a unicode string - """ - def match(m): - if m.group(1) is not None: - # Convert '\213;' to unichr(213) - try: - return unichr(int(m.group(1))) - except (ValueError, OverflowError) as _: - return "" - # Convert '\u' to '_' and '\\' to '\' - return u"_" if m.group(0) == u"\\u" else u"\\" - # Cut off the trailing underscore and apply the regex substitution - return _UNESCAPE_REGEX.sub(match, escaped_token[:-1]) diff --git a/tensor2tensor/data_generators/text_encoder_test.py b/tensor2tensor/data_generators/text_encoder_test.py new file mode 100644 index 000000000..7ac2ba911 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder_test.py @@ -0,0 +1,68 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for tensor2tensor.data_generators.text_encoder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.data_generators import text_encoder +import tensorflow as tf + + +class EscapeUnescapeTokenTest(tf.test.TestCase): + + def test_escape_token(self): + escaped = text_encoder._escape_token( + u'Foo! Bar.\nunder_score back\\slash', + set('abcdefghijklmnopqrstuvwxyz .\n') | text_encoder._ESCAPE_CHARS) + + self.assertEqual( + u'\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_', escaped) + + def test_unescape_token(self): + unescaped = text_encoder._unescape_token( + u'\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_') + + self.assertEqual( + u'Foo! 
Bar.\nunder_score back\\slash', unescaped) + + +class SubwordTextEncoderTest(tf.test.TestCase): + + def test_encode_decode(self): + token_counts = { + u'this': 9, + u'sentence': 14, + u'the': 100, + u'encoded': 1, + u'was': 20, + u'by': 50, + } + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 50, token_counts, 2, 10) + encoder.build_from_token_counts(token_counts, min_count=2) + + original = 'This sentence was encoded by the SubwordTextEncoder.' + encoded = encoder.encode(original) + decoded = encoder.decode(encoded) + self.assertEqual(original, decoded) + + +if __name__ == '__main__': + tf.test.main() From 83a757dd43df099b3c545e5cd2e9f9f9f0aed50b Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Tue, 25 Jul 2017 12:04:03 -0700 Subject: [PATCH 08/21] Adding encoder conv attention. A query block attends to a neighborhood to the left and the right of it. Pair programmed (Ashish + Niki) PiperOrigin-RevId: 163103460 --- tensor2tensor/models/common_attention.py | 97 +++++++++++++++++++++++- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 624623f4c..98a198f85 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -435,6 +435,91 @@ def local(x): return output +def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, + name=None): + """strided block local self-attention. + + Args: + q: a Tensor with shape [batch, heads, length, depth_k] + k: a Tensor with shape [batch, heads, length, depth_k] + v: a Tensor with shape [batch, heads, length, depth_v] + block_length: an integer + filter_width: an integer indicating how much to look left. + name: an optional string + + Returns: + a Tensor of shape [batch, heads, length, depth_v] + """ + with tf.variable_scope(name, default_name="local_self_attention_1d", + values=[q, k, v]): + v_shape = v.get_shape() + depth_v = tf.shape(v)[3] + batch_size = tf.shape(q)[0] + num_heads = tf.shape(q)[1] + original_length = tf.shape(q)[2] + # making sure q is a multiple of d + def pad_to_multiple(x, pad_length): + x_length = tf.shape(x)[2] + return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]]) + def pad_l_and_r(x, pad_length): + return tf.pad(x, [[0, 0], [0, 0], [pad_length, pad_length], [0, 0]]) + q = pad_to_multiple(q, block_length) + k = pad_to_multiple(k, block_length) + v = pad_to_multiple(v, block_length) + + # Setting up q blocks + new_q_shape = tf.shape(q) + # Setting up q blocks + q = tf.reshape(q, [new_q_shape[0], new_q_shape[1], + new_q_shape[2]//block_length, + block_length, new_q_shape[3]]) + + # Setting up k and v values + k = pad_l_and_r(k, filter_width) + v = pad_l_and_r(v, filter_width) + + length = tf.shape(k)[2] + full_filter_width = block_length + 2*filter_width + # getting gather indices + indices = tf.range(0, length, delta=1, name="index_range") + # making indices [1, length, 1] to appy convs + indices = tf.reshape(indices, [1, -1, 1]) + kernel = tf.expand_dims(tf.eye(full_filter_width), axis=1) + gather_indices = tf.nn.conv1d( + tf.cast(indices, tf.float32), + kernel, + block_length, + padding="VALID", + name="gather_conv") + + gather_indices = tf.squeeze(tf.cast(gather_indices, tf.int32), axis=0) + + # [length, batch, heads, dim] + k_t = tf.transpose(k, [2, 0, 1, 3]) + k_new = tf.gather(k_t, gather_indices) + + # [batch, heads, blocks, block_length, dim] + k_new = tf.transpose(k_new, [2, 3, 0, 1, 4]) + + attention_bias = tf.expand_dims( 
+ tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) + + v_t = tf.transpose(v, [2, 0, 1, 3]) + v_new = tf.gather(v_t, gather_indices) + v_new = tf.transpose(v_new, [2, 3, 0, 1, 4]) + + logits = tf.matmul(q, k_new, transpose_b=True) + + attention = tf.nn.softmax(logits+attention_bias) + output = tf.matmul(attention, v_new) + + output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) + # Remove the padding if introduced + output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) + output.set_shape(v_shape) + return output + + def multihead_attention(query_antecedent, memory_antecedent, bias, @@ -460,8 +545,9 @@ def multihead_attention(query_antecedent, dropout_rate: a floating point number image_shapes: optional tuple of integer scalars. see comments for attention_image_summary() - attention_type: a string, either "dot_product" or "local_mask_right" - block_length: an integer - relevent for "local_mask_right" + attention_type: a string, either "dot_product" or "local_mask_right" or + "local_unmasked" + block_length: an integer - relevant for "local_mask_right" name: an optional string Returns: @@ -509,9 +595,11 @@ def multihead_attention(query_antecedent, if attention_type == "dot_product": x = dot_product_attention( q, k, v, bias, dropout_rate, image_shapes) - else: - assert attention_type == "local_mask_right" + elif attention_type == "local_mask_right": x = masked_local_attention_1d(q, k, v, block_length=block_length) + else: + assert attention_type == "local_unmasked" + x = unmasked_local_attention_1d(q, k, v, block_length=block_length) x = combine_heads(x) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x @@ -652,4 +740,5 @@ def parameter_attention(x, y = tf.reshape(y, [batch_size, length, total_value_depth]) y.set_shape([None, None, total_value_depth]) y = common_layers.conv1d(y, output_depth, 1, name="output_transform") + return y From d190b79861d849569e42d8ad892b337983df39eb Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Tue, 25 Jul 2017 15:20:01 -0700 Subject: [PATCH 09/21] Update inspect.py to allow decoding with TokenTextEncoder and ByteTextEncoder. 
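The flags are checked in priority order: a SubwordTextEncoder vocabulary wins over a TokenTextEncoder vocabulary, which wins over --byte_text_encoder; with none of them set, no encoder is constructed. A rough standalone sketch of the same selection logic (the vocabulary paths and helper name are placeholders):

  from tensor2tensor.data_generators import text_encoder

  def make_encoder(subword_vocab=None, token_vocab=None, use_bytes=False):
    # Mirrors the flag priority used in inspect.py.
    if subword_vocab:
      return text_encoder.SubwordTextEncoder(subword_vocab)
    if token_vocab:
      return text_encoder.TokenTextEncoder(token_vocab)
    if use_bytes:
      return text_encoder.ByteTextEncoder()
    return None

  encoder = make_encoder(use_bytes=True)
  print(encoder.decode(encoder.encode("Hello")))  # round-trips to "Hello"
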
PiperOrigin-RevId: 163131045 --- tensor2tensor/data_generators/inspect.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensor2tensor/data_generators/inspect.py b/tensor2tensor/data_generators/inspect.py index 124c07017..6ba054d3c 100644 --- a/tensor2tensor/data_generators/inspect.py +++ b/tensor2tensor/data_generators/inspect.py @@ -34,6 +34,10 @@ tf.app.flags.DEFINE_string("subword_text_encoder_filename", "", "SubwordTextEncoder vocabulary file") +tf.app.flags.DEFINE_string("token_text_encoder_filename", "", + "TokenTextEncoder vocabulary file") +tf.app.flags.DEFINE_bool("byte_text_encoder", False, + "use a ByteTextEncoder") tf.app.flags.DEFINE_string("input_filename", "", "input filename") tf.app.flags.DEFINE_bool("print_inputs", False, "Print decoded inputs to stdout") @@ -48,6 +52,11 @@ def main(_): if FLAGS.subword_text_encoder_filename: encoder = text_encoder.SubwordTextEncoder( FLAGS.subword_text_encoder_filename) + elif FLAGS.token_text_encoder_filename: + encoder = text_encoder.TokenTextEncoder( + FLAGS.token_text_encoder_filename) + elif FLAGS.byte_text_encoder: + encoder = text_encoder.ByteTextEncoder() else: encoder = None reader = tf.python_io.tf_record_iterator(FLAGS.input_filename) From 7de63bd1dac3482d6c2388b715b958d3726870c9 Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Tue, 25 Jul 2017 15:58:10 -0700 Subject: [PATCH 10/21] Character-level version of lm1b. PiperOrigin-RevId: 163136520 --- tensor2tensor/bin/t2t-datagen | 4 ++++ tensor2tensor/data_generators/lm1b.py | 10 +++++++--- .../data_generators/problem_hparams.py | 17 +++++++++++++++++ tensor2tensor/data_generators/text_encoder.py | 2 ++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 783906d95..a9fa12255 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -110,6 +110,10 @@ _SUPPORTED_PROBLEM_GENERATORS = { lambda: lm1b.generator(FLAGS.tmp_dir, True), lambda: lm1b.generator(FLAGS.tmp_dir, False) ), + "lm1b_characters": ( + lambda: lm1b.generator(FLAGS.tmp_dir, True, characters=True), + lambda: lm1b.generator(FLAGS.tmp_dir, False, characters=True) + ), "wiki_32k": ( lambda: wiki.generator(FLAGS.tmp_dir, True), 1000 diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index 562435184..a436e0e6e 100644 --- a/tensor2tensor/data_generators/lm1b.py +++ b/tensor2tensor/data_generators/lm1b.py @@ -63,7 +63,7 @@ def _original_vocab(tmp_dir): def _replace_oov(original_vocab, line): """Replace out-of-vocab words with "UNK". - This maintains compatability with published results. + This maintains compatibility with published results. Args: original_vocab: a set of strings (The standard vocabulary for the dataset) @@ -138,12 +138,13 @@ def _get_or_build_subword_text_encoder(tmp_dir): return ret -def generator(tmp_dir, train): +def generator(tmp_dir, train, characters=False): """Generator for lm1b sentences. Args: tmp_dir: a string. train: a boolean. 
+ characters: a boolean Yields: A dictionary {"inputs": [0], "targets": []} @@ -152,7 +153,10 @@ def generator(tmp_dir, train): original_vocab = _original_vocab(tmp_dir) files = (_train_data_filenames(tmp_dir) if train else [_dev_data_filename(tmp_dir)]) - encoder = _get_or_build_subword_text_encoder(tmp_dir) + if characters: + encoder = text_encoder.ByteTextEncoder() + else: + encoder = _get_or_build_subword_text_encoder(tmp_dir) for filepath in files: tf.logging.info("filepath = %s", filepath) for line in tf.gfile.Open(filepath): diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 159ea6ac9..2792c79e9 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -336,6 +336,22 @@ def lm1b_32k(model_hparams): return p +def lm1b_characters(unused_model_hparams): + """Billion-word language-modeling benchmark, 32k subword vocabulary.""" + p = default_problem_hparams() + # ratio of dev tokens (including eos) to dev words (including eos) + # 826189 / 159658 = 5.174742 + p.perplexity_exponent = 5.174742 + p.input_modality = {} + encoder = text_encoder.ByteTextEncoder() + p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) + p.vocabulary = { + "targets": encoder + } + p.target_space_id = 2 + return p + + def wiki_32k(model_hparams): """Wikipedia title to article. 32k subtoken vocabulary.""" p = default_problem_hparams() @@ -623,6 +639,7 @@ def image_celeba(unused_model_hparams): "audio_wsj_characters_test": audio_wsj_characters, "audio_wsj_tokens_8k_tune": lambda p: audio_wsj_tokens(p, 2**13), "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), + "lm1b_characters": lm1b_characters, "lm1b_32k": lm1b_32k, "wiki_32k": wiki_32k, "lmptb_10k": lmptb_10k, diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index afe1da9ae..6b01e3a35 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -128,6 +128,8 @@ class ByteTextEncoder(TextEncoder): def encode(self, s): numres = self._num_reserved_ids if six.PY2: + if isinstance(s, unicode): + s = s.encode("utf-8") return [ord(c) + numres for c in s] # Python3: explicitly convert to UTF-8 return [c + numres for c in s.encode("utf-8")] From fd1a87d214861ea8d8ec3079cd636b145aad7630 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 25 Jul 2017 17:06:12 -0700 Subject: [PATCH 11/21] Problem.eval_metrics PiperOrigin-RevId: 163145397 --- tensor2tensor/data_generators/genetics.py | 7 +- tensor2tensor/data_generators/problem.py | 18 ++++ tensor2tensor/data_generators/wmt.py | 15 +++ tensor2tensor/utils/metrics.py | 109 ++++++++++++++-------- tensor2tensor/utils/t2t_model.py | 4 +- tensor2tensor/utils/trainer_utils.py | 26 ++++-- 6 files changed, 129 insertions(+), 50 deletions(-) diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index 848c2341b..309580d53 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -49,6 +49,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import metrics from tensor2tensor.utils import registry import tensorflow as tf @@ -141,7 +142,8 @@ def hparams(self, defaults, model_hparams): p.target_space_id = problem.SpaceID.REAL def 
example_reading_spec(self): - # TODO(rsepassi): propagate and apply targets_mask to output RealModality. + # TODO(rsepassi): propagate and apply targets_mask to output RealModality + # and to eval metrics (weights_fn?). data_fields = { "inputs": tf.VarLenFeature(tf.int64), "targets_mask": tf.VarLenFeature(tf.float32), @@ -158,6 +160,9 @@ def preprocess_examples(self, examples, mode): return examples + def eval_metrics(self): + return [metrics.Metrics.RMSE] + @registry.register_problem("genetics_cage10") class GeneticsCAGE10(GeneExpressionProblem): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 22b6214e6..69d81e58e 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -22,6 +22,7 @@ from tensor2tensor.data_generators import generator_utils as utils from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import metrics import tensorflow as tf @@ -111,6 +112,17 @@ class Problem(object): * hparams(defaults, model_hparams) - Specify the problem hyperparameters (see _default_hparams) - Mutate defaults as needed + * example_reading_spec + - Specify the names and types of the features on disk. + - Specify tf.contrib.slim.tfexample_decoder + * preprocess_examples(examples, mode) + - Preprocess the example feature dict from feature name to Tensor or + SparseTensor. + - Used in training, eval, and inference (specified by mode). + + Eval: + * eval_metrics + - Specify the set of evaluation metrics for this problem. Inference: * feature_encoders(data_dir) @@ -151,6 +163,12 @@ def preprocess_examples(self, examples, mode): del mode return examples + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY + ] + # ============================================================================ # END SUBCLASS INTERFACE # ============================================================================ diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 50125ccd1..519d55996 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -28,6 +28,7 @@ from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators import wsj_parsing +from tensor2tensor.utils import metrics from tensor2tensor.utils import registry import tensorflow as tf @@ -120,6 +121,13 @@ def hparams(self, defaults, unused_model_hparams): if self.is_character_level: p.loss_multiplier = 2.0 + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU + ] + # Generic generators used later for multiple problems. 
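# The new hook is not specific to wmt.py: any registered Problem can declare
# its own metrics the same way. A hypothetical, minimal sketch (the class and
# its registration name are invented for illustration; problem, metrics and
# registry are already imported at the top of this file, and the returned
# entries must be keys of metrics.METRICS_FNS or create_evaluation_metrics
# raises ValueError):


@registry.register_problem("example_regression_problem")
class ExampleRegressionProblem(problem.Problem):

  def eval_metrics(self):
    return [metrics.Metrics.ACC, metrics.Metrics.RMSE]
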
@@ -658,6 +666,13 @@ def input_space_id(self): def target_space_id(self): return problem.SpaceID.CS_TOK + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU + ] + @registry.register_problem("wmt_encs_characters") class WMTEnCsCharacters(WMTProblem): diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 118e33394..29f44b574 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -18,8 +18,6 @@ from __future__ import division from __future__ import print_function -import functools - # Dependency imports import six @@ -29,7 +27,24 @@ import tensorflow as tf -FLAGS = tf.flags.FLAGS + +class Metrics(object): + """Available evaluation metrics.""" + # Entries here should match the keys in METRICS_FN below + ACC = "accuracy" + ACC_TOP5 = "accuracy_top5" + ACC_PER_SEQ = "accuracy_per_sequence" + NEG_LOG_PERPLEXITY = "neg_log_perplexity" + APPROX_BLEU = "approx_bleu_score" + RMSE = "rmse" + + +def padded_rmse(predictions, labels, weights_fn=common_layers.weights_all): + predictions, labels = common_layers.pad_with_zeros(predictions, labels) + targets = labels + weights = weights_fn(targets) + error = tf.sqrt(tf.pow(predictions - labels, 2)) + return tf.reduce_sum(error * weights), tf.reduce_sum(weights) def padded_accuracy_topk(predictions, @@ -98,62 +113,76 @@ def create_evaluation_metrics(problems): """Creates the evaluation metrics for the model. Args: - problems: List of strings containing the name of the problems. + problems: List of tuples (problem name, problem instance). Returns: A dictionary with keys that are strings naming the evaluation metrics and values that are functions taking arguments of (predictions, targets), returning a tuple of a tensor of the metric's value together with an op to update the metric's value. + + Raises: + ValueError: if the metrics specified by a problem are not recognized (i.e. + are not defined in the Metrics enum. """ - def append_metric_fns(metric_tup, eval_metrics): - """Append problem-specific and global metrics to eval_metrics.""" - metric_name, metric_function = metric_tup - def fn(predictions, labels, weights, idx, weights_fn): - # The 'weights' argument represents problem-choice here, - # we need to keep this name because MetricSpecs checks it. + def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): + """Create a metric fn conditioned on problem_idx.""" + + def problem_metric_fn(predictions, labels, weights): problem_choice = weights (scores, weights) = tf.cond( - tf.equal(idx, problem_choice), # pylint: disable=cell-var-from-loop - lambda: metric_function(predictions, labels, weights_fn=weights_fn), + tf.equal(problem_idx, problem_choice), + lambda: metric_fn(predictions, labels, weights_fn=weights_fn), lambda: (tf.constant(0.0), tf.constant(0.0))) # The tf.metrics.mean function assures correct aggregation. 
return tf.metrics.mean(scores, weights) - for i, problem in enumerate(problems): - name = "metrics-%s/%s" % (problem, metric_name) - class_output = "image" in problem and "coco" not in problem - weights_fn = (common_layers.weights_all if class_output - else common_layers.weights_nonzero) - eval_metrics[name] = functools.partial(fn, idx=i, weights_fn=weights_fn) - - def global_fn(predictions, labels, weights): - (scores, weights) = metric_function(predictions, labels) - return tf.metrics.mean(scores, weights) - - eval_metrics["metrics/%s" % metric_name] = global_fn + return problem_metric_fn eval_metrics = dict() - - # Metrics are functions that take predictions and labels and return - # a tensor of metrics and a tensor of weights. - # The results are passed to tf.metrics.mean to accumulate properly. - metrics_list = [("accuracy", padded_accuracy), ("accuracy_top5", - padded_accuracy_top5), - ("accuracy_per_sequence", padded_sequence_accuracy), - ("neg_log_perplexity", padded_neg_log_perplexity)] - - # TODO(nikip): Extend this to support use of custom metrics for problems. - for problem in problems: - if "wmt" in problem: - metrics_list.append(("approx_bleu_score", bleu_hook.bleu_score)) - - for metric in metrics_list: - append_metric_fns(metric, eval_metrics) + for problem_idx, (problem_name, problem_instance) in enumerate(problems): + if problem_instance is None: + # For problems in problem_hparams + metrics = [ + Metrics.ACC, Metrics.ACC_TOP5, Metrics.ACC_PER_SEQ, + Metrics.NEG_LOG_PERPLEXITY + ] + if "wmt" in problem_name: + metrics.append(Metrics.APPROX_BLEU) + else: + # For registered Problems + metrics = problem_instance.eval_metrics() + if not all([m in METRICS_FNS for m in metrics]): + raise ValueError("Unrecognized metric. Problem %s specified metrics " + "%s. Recognized metrics are %s." % + (problem_name, metrics, METRICS_FNS.keys())) + + class_output = "image" in problem_name and "coco" not in problem_name + weights_fn = (common_layers.weights_all + if class_output else common_layers.weights_nonzero) + + for metric in metrics: + metric_fn = METRICS_FNS[metric] + problem_metric_fn = make_problem_specific_metric_fn( + metric_fn, problem_idx, weights_fn) + eval_metrics["metrics-%s/%s" % (problem_name, metric)] = problem_metric_fn return { k: tf.contrib.learn.MetricSpec( v, prediction_key="predictions", weight_key="problem_choice") for (k, v) in six.iteritems(eval_metrics) } + + +# Metrics are functions that take predictions and labels and return +# a tensor of metrics and a tensor of weights. +# The results are passed to tf.metrics.mean to accumulate properly. +METRICS_FNS = { + Metrics.ACC: padded_accuracy, + Metrics.ACC_TOP5: padded_accuracy_top5, + Metrics.ACC_PER_SEQ: padded_sequence_accuracy, + Metrics.NEG_LOG_PERPLEXITY: padded_neg_log_perplexity, + Metrics.APPROX_BLEU: bleu_hook.bleu_score, + Metrics.RMSE: padded_rmse, +} diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 9777568fc..66e40d495 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -164,6 +164,8 @@ def infer(self, Returns: samples: an integer `Tensor`. """ + # TODO(rsepassi): Make decoding work with real-valued model outputs + # (i.e. if the target modality is RealModality). if not self.has_input: # since there is no input, it is more interesting to see randomly # generated sequences, than to see the most likely sequence repeatedly. 
@@ -500,5 +502,5 @@ def _warn_changed_modality_type(new_name, old_name, feature_name): old_type, old_name = registry.parse_modality_name(old_name) if new_type != old_type: tf.logging.warning("%s has a designated modality type %s (%s) but has been " - "overriden with a modality of type %s (%s).", + "overridden with a modality of type %s (%s).", feature_name, old_type, old_name, new_type, new_name) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 0943881f3..bf42c36cc 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -85,7 +85,7 @@ flags.DEFINE_integer("local_eval_frequency", 2000, "Run evaluation every this steps during local training.") flags.DEFINE_bool("locally_shard_to_cpu", False, - "Use CPU as a sharding device runnning locally. This allows " + "Use CPU as a sharding device running locally. This allows " "to test sharded model construction on a machine with 1 GPU.") flags.DEFINE_bool("daisy_chain_variables", True, "copy variables around in a daisy chain") @@ -103,6 +103,9 @@ flags.DEFINE_integer("ps_replicas", 0, "How many ps replicas.") # Decode flags +# Set one of {decode_from_dataset, decode_interactive, decode_from_file} to +# decode. +flags.DEFINE_bool("decode_from_dataset", False, "Decode from dataset on disk.") flags.DEFINE_bool("decode_use_last_position_only", False, "In inference, use last position only for speedup.") flags.DEFINE_bool("decode_interactive", False, @@ -152,17 +155,24 @@ def experiment_fn(output_dir): def create_experiment(output_dir, data_dir, model_name, train_steps, eval_steps): + """Create Experiment.""" hparams = create_hparams(FLAGS.hparams_set, data_dir) estimator, input_fns = create_experiment_components( hparams=hparams, output_dir=output_dir, data_dir=data_dir, model_name=model_name) + eval_metrics = metrics.create_evaluation_metrics( + zip(FLAGS.problems.split("-"), hparams.problem_instances)) + if ("autotune" in FLAGS and FLAGS.autotune and + FLAGS.objective not in eval_metrics): + raise ValueError("Tuning objective %s not among evaluation metrics %s" % + (FLAGS.objective, eval_metrics.keys())) return tf.contrib.learn.Experiment( estimator=estimator, train_input_fn=input_fns["train"], eval_input_fn=input_fns["eval"], - eval_metrics=metrics.create_evaluation_metrics(FLAGS.problems.split("-")), + eval_metrics=eval_metrics, train_steps=train_steps, eval_steps=eval_steps, min_eval_frequency=FLAGS.local_eval_frequency, @@ -585,18 +595,18 @@ def run_locally(exp): Args: exp: Experiment. """ - if exp.train_steps > 0: - # Train - tf.logging.info("Performing local training.") + if exp.train_steps > 0 or exp.eval_steps > 0: + tf.logging.info("Performing local training and evaluation.") exp.train_and_evaluate() + decode(exp.estimator) - # Predict - estimator = exp.estimator + +def decode(estimator): if FLAGS.decode_interactive: decode_interactively(estimator) elif FLAGS.decode_from_file is not None: decode_from_file(estimator, FLAGS.decode_from_file) - else: + elif FLAGS.decode_from_dataset: decode_from_dataset(estimator) From ca08ad9bf1ec957646a17eda089d3b530fb77d93 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Tue, 25 Jul 2017 17:26:22 -0700 Subject: [PATCH 12/21] Un-reorder of timing signals to make trained models work. 
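Both layouts carry the same information, but weights trained with the original ordering expect all sin components in the first half of the channel dimension and all cos components in the second half; feeding them the interleaved variant silently changes what every channel encodes. A small numpy sketch of the difference (sizes chosen arbitrarily):

  import numpy as np

  length, channels = 5, 8
  num_timescales = channels // 2
  position = np.arange(length, dtype=np.float32)
  inv_timescales = np.exp(
      -np.arange(num_timescales) * (np.log(1.0e4) / (num_timescales - 1)))
  scaled_time = position[:, None] * inv_timescales[None, :]

  # Original (and now restored) layout: [sin | cos] concatenated on channels.
  concat_signal = np.concatenate(
      [np.sin(scaled_time), np.cos(scaled_time)], axis=1)

  # Layout produced by the reshape/transpose that this change removes:
  # sin and cos interleaved per timescale.
  interleaved = concat_signal.reshape(length, 2, num_timescales).transpose(
      0, 2, 1).reshape(length, channels)

  # Same values, different channel order -- enough to confuse weights that
  # were trained against the other layout.
  assert not np.allclose(concat_signal, interleaved)
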
PiperOrigin-RevId: 163147659 --- tensor2tensor/models/common_attention.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 98a198f85..1a8b2c79d 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -66,9 +66,6 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) - signal = tf.reshape(signal, [length, 2, num_timescales]) - signal = tf.transpose(signal, perm=[0, 2, 1]) - signal = tf.reshape(signal, [length, channels]) signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) signal = tf.reshape(signal, [1, length, channels]) return x + signal From b9fcd66f14ecded01cc257932655f5b1f493e3b9 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Tue, 25 Jul 2017 18:12:16 -0700 Subject: [PATCH 13/21] Back to wmt16 on one set not downloadable from wmt17, internal merges. PiperOrigin-RevId: 163152415 --- tensor2tensor/bin/t2t-datagen | 1 + tensor2tensor/data_generators/text_encoder.py | 9 +++++---- tensor2tensor/data_generators/wmt.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index a9fa12255..629014713 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -281,6 +281,7 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): + tf.logging.info("Generating training data for %s.", problem_name) problem = registry.problem(problem_name) task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 6b01e3a35..9fc9eed88 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -30,6 +30,7 @@ # Dependency imports import six +from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf @@ -457,7 +458,7 @@ def build_from_token_counts(self, # with high enough counts for our new vocabulary. if min_count < 1: min_count = 1 - for i in six.moves.range(num_iterations): + for i in xrange(num_iterations): tf.logging.info("Iteration {0}".format(i)) counts = collections.defaultdict(int) for token, count in six.iteritems(token_counts): @@ -474,7 +475,7 @@ def build_from_token_counts(self, starts.append(pos) pos += len(self._all_subtoken_strings[subtoken]) for start in starts: - for end in six.moves.range(start + 1, len(escaped_token) + 1): + for end in xrange(start + 1, len(escaped_token) + 1): subtoken_string = escaped_token[start:end] counts[subtoken_string] += count # Array of sets of candidate subtoken strings, by length @@ -491,7 +492,7 @@ def build_from_token_counts(self, new_subtoken_strings = [] # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. 
- for lsub in six.moves.range(len(len_to_subtoken_strings)-1, 0, -1): + for lsub in xrange(len(len_to_subtoken_strings)-1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: count = counts[subtoken_string] @@ -500,7 +501,7 @@ def build_from_token_counts(self, # explicitly, regardless of count. if subtoken_string not in self._alphabet: new_subtoken_strings.append((count, subtoken_string)) - for l in six.moves.range(1, lsub): + for l in xrange(1, lsub): counts[subtoken_string[:l]] -= count new_subtoken_strings.sort(reverse=True) diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 519d55996..9587d4d2a 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -259,7 +259,7 @@ def bi_vocabs_token_generator(source_path, _ENDE_TRAIN_DATASETS = [ [ - "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long ("training-parallel-nc-v11/news-commentary-v11.de-en.en", "training-parallel-nc-v11/news-commentary-v11.de-en.de") ], From 92101af0f2fbc4e16557fd688bde9cd9cc33a452 Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Wed, 26 Jul 2017 09:58:41 -0700 Subject: [PATCH 14/21] Bug fix, specify axis for squeeze when computing BLEU score PiperOrigin-RevId: 163219501 --- tensor2tensor/utils/bleu_hook.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index 06d62ad1e..20a7c8426 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -92,7 +92,6 @@ def compute_bleu(reference_corpus, matches_by_order[len(ngram) - 1] += overlap[ngram] for ngram in translation_ngram_counts: possible_matches_by_order[len(ngram)-1] += translation_ngram_counts[ngram] - precisions = [0] * max_order for i in xrange(0, max_order): if possible_matches_by_order[i] > 0: @@ -107,7 +106,6 @@ def compute_bleu(reference_corpus, if use_bp: ratio = translation_length / reference_length bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0 - bleu = geo_mean * bp return np.float32(bleu) @@ -128,8 +126,8 @@ def bleu_score(predictions, labels, **unused_kwargs): """ outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) # Convert the outputs and labels to a [batch_size, input_length] tensor. 
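  # Shape note (assuming the usual T2T model outputs): predictions arrive as
  # [batch, length, 1, 1, vocab_size], so after the argmax `outputs` is
  # [batch, length, 1, 1], and `labels` is [batch, length, 1, 1] as well.
  # An axis-less tf.squeeze would also drop a batch or length dimension that
  # happens to equal 1 (e.g. batch size 1 while decoding), hence the explicit
  # trailing axes below.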
- outputs = tf.squeeze(outputs) - labels = tf.squeeze(labels) + outputs = tf.squeeze(outputs, axis=[-1, -2]) + labels = tf.squeeze(labels, axis=[-1, -2]) bleu = tf.py_func(compute_bleu, (labels, outputs), tf.float32) return bleu, tf.constant(1.0) From 28eb48f9d1799fbe83ae54c88c02fa4301f97120 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 26 Jul 2017 12:42:01 -0700 Subject: [PATCH 15/21] Limit number of concurrent processes in GeneExpressionProblem PiperOrigin-RevId: 163241281 --- tensor2tensor/data_generators/genetics.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index 309580d53..88b82cb49 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -36,6 +36,7 @@ from __future__ import print_function import itertools +import math import multiprocessing as mp import os @@ -54,6 +55,7 @@ import tensorflow as tf +MAX_CONCURRENT_PROCESSES = 10 _bases = list("ACTG") @@ -122,12 +124,19 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): start_idx, end_idx)) processes.append(p) - # Start and wait for processes + # Start and wait for processes in batches assert len(processes) == num_shards + 2 # 1 per training shard + dev + test - for p in processes: - p.start() - for p in processes: - p.join() + + num_batches = int( + math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES)) + for i in xrange(num_batches): + start = i * MAX_CONCURRENT_PROCESSES + end = start + MAX_CONCURRENT_PROCESSES + current = processes[start:end] + for p in current: + p.start() + for p in current: + p.join() # Shuffle generator_utils.shuffle_dataset(all_filepaths) From cff9f4367095e62b415637d6fb839db7bdc8a28d Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 26 Jul 2017 13:52:47 -0700 Subject: [PATCH 16/21] Allow building a subword vocab from a word vocab file and add tests. PiperOrigin-RevId: 163250427 --- .../data_generators/test_data/corpus-1.txt | 4 + .../data_generators/test_data/corpus-2.txt | 3 + .../data_generators/test_data/vocab-1.txt | 2 + .../data_generators/test_data/vocab-2.txt | 3 + tensor2tensor/data_generators/text_encoder.py | 123 ++++++++-------- .../text_encoder_build_subword.py | 36 +++-- .../data_generators/text_encoder_test.py | 107 ++++++++++++-- tensor2tensor/data_generators/tokenizer.py | 124 +++++++++++------ .../data_generators/tokenizer_test.py | 131 +++++++++++++++--- 9 files changed, 387 insertions(+), 146 deletions(-) create mode 100644 tensor2tensor/data_generators/test_data/corpus-1.txt create mode 100644 tensor2tensor/data_generators/test_data/corpus-2.txt create mode 100644 tensor2tensor/data_generators/test_data/vocab-1.txt create mode 100644 tensor2tensor/data_generators/test_data/vocab-2.txt diff --git a/tensor2tensor/data_generators/test_data/corpus-1.txt b/tensor2tensor/data_generators/test_data/corpus-1.txt new file mode 100644 index 000000000..c05e47f90 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/corpus-1.txt @@ -0,0 +1,4 @@ +One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't +know. + +Groucho Marx diff --git a/tensor2tensor/data_generators/test_data/corpus-2.txt b/tensor2tensor/data_generators/test_data/corpus-2.txt new file mode 100644 index 000000000..f45577c4b --- /dev/null +++ b/tensor2tensor/data_generators/test_data/corpus-2.txt @@ -0,0 +1,3 @@ +I haven't slept for 10 days... because that would be too long. 
+ +Mitch Hedberg diff --git a/tensor2tensor/data_generators/test_data/vocab-1.txt b/tensor2tensor/data_generators/test_data/vocab-1.txt new file mode 100644 index 000000000..d34d3d957 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/vocab-1.txt @@ -0,0 +1,2 @@ +lollipop,8 +reverberated,12 diff --git a/tensor2tensor/data_generators/test_data/vocab-2.txt b/tensor2tensor/data_generators/test_data/vocab-2.txt new file mode 100644 index 000000000..7793af4f6 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/vocab-2.txt @@ -0,0 +1,3 @@ +kattywampus,11 +balderdash,10 +jiggery-pokery,14 diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 9fc9eed88..69d29779a 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -30,11 +30,11 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf +xrange = six.moves.xrange # pylint: disable=redefined-builtin # Reserved tokens for things like padding and EOS symbols. PAD = "" @@ -295,7 +295,7 @@ def encode(self, raw_text): Returns: a list of integers in the range [0, vocab_size) """ - return self._tokens_to_subtokens(tokenizer.encode( + return self._tokens_to_subtoken_ids(tokenizer.encode( native_to_unicode(raw_text))) def decode(self, subtokens): @@ -307,14 +307,14 @@ def decode(self, subtokens): a native string """ return unicode_to_native(tokenizer.decode( - self._subtokens_to_tokens(subtokens))) + self._subtoken_ids_to_tokens(subtokens))) @property def vocab_size(self): """The subtoken vocabulary size.""" return len(self._all_subtoken_strings) - def _tokens_to_subtokens(self, tokens): + def _tokens_to_subtoken_ids(self, tokens): """Converts a list of tokens to a list of subtoken ids. Args: @@ -324,11 +324,11 @@ def _tokens_to_subtokens(self, tokens): """ ret = [] for token in tokens: - ret.extend(self._escaped_token_to_subtokens( + ret.extend(self._escaped_token_to_subtoken_ids( _escape_token(token, self._alphabet))) return ret - def _subtokens_to_tokens(self, subtokens): + def _subtoken_ids_to_tokens(self, subtokens): """Converts a list of subtoken ids to a list of tokens. Args: @@ -337,45 +337,58 @@ def _subtokens_to_tokens(self, subtokens): a list of strings. """ concatenated = "".join( - [self._subtoken_to_subtoken_string(s) for s in subtokens]) + [self._subtoken_id_to_subtoken_string(s) for s in subtokens]) split = concatenated.split("_") return [_unescape_token(t + "_") for t in split if t] - def _subtoken_to_subtoken_string(self, subtoken): - """Subtoken_String (string) corresponding to the given subtoken (id).""" + def _subtoken_id_to_subtoken_string(self, subtoken): + """Converts a subtoken integer ID to a subtoken string.""" if 0 <= subtoken < self.vocab_size: return self._all_subtoken_strings[subtoken] return u"" - def _escaped_token_to_subtokens(self, escaped_token): - """Converts an escaped token string to a list of subtokens. + def _escaped_token_to_subtoken_strings(self, escaped_token): + """Converts an escaped token string to a list of subtoken strings. Args: - escaped_token: an escaped token + escaped_token: An escaped token as a unicode string. Returns: - a list of one or more integers. + A list of subtokens as unicode strings. """ + # NOTE: This algorithm is greedy; it won't necessarily produce the "best" + # list of subtokens. 
ret = [] - pos = 0 - lesc = len(escaped_token) - while pos < lesc: - end = min(lesc, pos + self._max_subtoken_len) - while end > pos: - subtoken_id = self._subtoken_string_to_id.get(escaped_token[pos:end]) - if subtoken_id is not None: + start = 0 + token_len = len(escaped_token) + while start < token_len: + for end in xrange( + min(token_len, start + self._max_subtoken_len), start, -1): + subtoken = escaped_token[start:end] + if subtoken in self._subtoken_string_to_id: + ret.append(subtoken) + start = end break - end -= 1 - # If there is no possible encoding of the escaped token then one of the - # characters in the token is not in the alphabet. This should be - # impossible and would be indicative of a bug. - assert subtoken_id is not None - - ret.append(subtoken_id) - pos = end + else: # Did not break + # If there is no possible encoding of the escaped token then one of the + # characters in the token is not in the alphabet. This should be + # impossible and would be indicative of a bug. + assert False, "Token substring not found in subtoken vocabulary." return ret + def _escaped_token_to_subtoken_ids(self, escaped_token): + """Converts an escaped token string to a list of subtoken IDs. + + Args: + escaped_token: An escaped token as a unicode string. + Returns: + A list of subtoken IDs as integers. + """ + return [ + self._subtoken_string_to_id[subtoken] + for subtoken in self._escaped_token_to_subtoken_strings(escaped_token)] + @classmethod def build_to_target_size(cls, target_size, @@ -460,55 +473,51 @@ def build_from_token_counts(self, min_count = 1 for i in xrange(num_iterations): tf.logging.info("Iteration {0}".format(i)) - counts = collections.defaultdict(int) + + # Collect all substrings of the encoded token that break along current + # subtoken boundaries. + subtoken_counts = collections.defaultdict(int) for token, count in six.iteritems(token_counts): escaped_token = _escape_token(token, self._alphabet) - # we will count all tails of the escaped_token, starting from boundaries - # determined by our current segmentation. - if i == 0: - starts = six.moves.range(len(escaped_token)) - else: - subtokens = self._escaped_token_to_subtokens(escaped_token) - pos = 0 - starts = [] - for subtoken in subtokens: - starts.append(pos) - pos += len(self._all_subtoken_strings[subtoken]) - for start in starts: + subtokens = self._escaped_token_to_subtoken_strings(escaped_token) + start = 0 + for subtoken in subtokens: for end in xrange(start + 1, len(escaped_token) + 1): - subtoken_string = escaped_token[start:end] - counts[subtoken_string] += count - # Array of sets of candidate subtoken strings, by length + new_subtoken = escaped_token[start:end] + subtoken_counts[new_subtoken] += count + start += len(subtoken) + + # Array of sets of candidate subtoken strings, by length. len_to_subtoken_strings = [] - for subtoken_string, count in six.iteritems(counts): + for subtoken_string, count in six.iteritems(subtoken_counts): lsub = len(subtoken_string) - # Always include all the alphabet characters or some strings will - # be unencodeable. - if count >= min_count or subtoken_string in self._alphabet: - # Add this subtoken string to its length set + if count >= min_count: while len(len_to_subtoken_strings) <= lsub: len_to_subtoken_strings.append(set()) len_to_subtoken_strings[lsub].add(subtoken_string) - new_subtoken_strings = [] + # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. 
+ new_subtoken_strings = [] for lsub in xrange(len(len_to_subtoken_strings)-1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: - count = counts[subtoken_string] - if count >= min_count or subtoken_string in self._alphabet: - # Exclude alphabet tokens here, as they must be included later + count = subtoken_counts[subtoken_string] + if count >= min_count: + # Exclude alphabet tokens here, as they must be included later, # explicitly, regardless of count. if subtoken_string not in self._alphabet: new_subtoken_strings.append((count, subtoken_string)) for l in xrange(1, lsub): - counts[subtoken_string[:l]] -= count + subtoken_counts[subtoken_string[:l]] -= count + + # Include the alphabet explicitly to guarantee all strings are encodable. + new_subtoken_strings.extend( + (subtoken_counts.get(a, 0), a) for a in self._alphabet) new_subtoken_strings.sort(reverse=True) - # Reinitialize to the candidate vocabulary, including the alphabet - # explicitly as the highest priority. + # Reinitialize to the candidate vocabulary. self._init_subtokens_from_list( - list(self._alphabet) + [subtoken for _, subtoken in new_subtoken_strings], reserved=num_reserved_ids) tf.logging.info("vocab_size = %d" % self.vocab_size) diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py index a0d5d8937..88dfac116 100644 --- a/tensor2tensor/data_generators/text_encoder_build_subword.py +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -39,10 +39,13 @@ import tensorflow as tf -tf.app.flags.DEFINE_string('output_fn', '/tmp/my.subword_text_encoder', +tf.app.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder', 'where to store the SubwordTextEncoder') tf.app.flags.DEFINE_string('corpus_filepattern', '', 'Corpus of one or more text files') +tf.app.flags.DEFINE_string('vocab_filepattern', '', + 'One or more vocabulary files ' + '(one word per line as "word,count")') tf.app.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus') tf.app.flags.DEFINE_integer('corpus_max_lines', 10000, 'How many lines of corpus to read') @@ -52,16 +55,27 @@ def main(unused_argv): - gs = text_encoder.SubwordTextEncoder() - if not FLAGS.corpus_filepattern: - raise ValueError('Must provide --corpus_filepattern') - token_counts = tokenizer.corpus_token_counts( - FLAGS.corpus_filepattern, FLAGS.corpus_max_lines, - split_on_newlines=FLAGS.split_on_newlines) - gs.build_from_token_counts(token_counts, - FLAGS.min_count, - FLAGS.num_iterations) - gs.store_to_file(FLAGS.output_fn) + if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern: + raise ValueError( + 'Must only provide one of --corpus_filepattern or --vocab_filepattern') + + elif FLAGS.corpus_filepattern: + token_counts = tokenizer.corpus_token_counts( + FLAGS.corpus_filepattern, FLAGS.corpus_max_lines, + split_on_newlines=FLAGS.split_on_newlines) + + elif FLAGS.vocab_filepattern: + token_counts = tokenizer.vocab_token_counts( + FLAGS.vocab_filepattern, FLAGS.corpus_max_lines) + + else: + raise ValueError( + 'Must provide one of --corpus_filepattern or --vocab_filepattern') + + encoder = text_encoder.SubwordTextEncoder() + encoder.build_from_token_counts( + token_counts, FLAGS.min_count, FLAGS.num_iterations) + encoder.store_to_file(FLAGS.output_fn) if __name__ == '__main__': diff --git a/tensor2tensor/data_generators/text_encoder_test.py b/tensor2tensor/data_generators/text_encoder_test.py index 7ac2ba911..4142f8699 
100644 --- a/tensor2tensor/data_generators/text_encoder_test.py +++ b/tensor2tensor/data_generators/text_encoder_test.py @@ -18,8 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from __future__ import unicode_literals + +import collections # Dependency imports +import mock from tensor2tensor.data_generators import text_encoder import tensorflow as tf @@ -29,40 +33,113 @@ class EscapeUnescapeTokenTest(tf.test.TestCase): def test_escape_token(self): escaped = text_encoder._escape_token( - u'Foo! Bar.\nunder_score back\\slash', + 'Foo! Bar.\nunder_score back\\slash', set('abcdefghijklmnopqrstuvwxyz .\n') | text_encoder._ESCAPE_CHARS) self.assertEqual( - u'\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_', escaped) + '\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_', escaped) def test_unescape_token(self): unescaped = text_encoder._unescape_token( - u'\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_') + '\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_') self.assertEqual( - u'Foo! Bar.\nunder_score back\\slash', unescaped) + 'Foo! Bar.\nunder_score back\\slash', unescaped) class SubwordTextEncoderTest(tf.test.TestCase): def test_encode_decode(self): - token_counts = { - u'this': 9, - u'sentence': 14, - u'the': 100, - u'encoded': 1, - u'was': 20, - u'by': 50, - } + corpus = ( + 'This is a corpus of text that provides a bunch of tokens from which ' + 'to build a vocabulary. It will be used when strings are encoded ' + 'with a TextEncoder subclass. The encoder was coded by a coder.') + token_counts = collections.Counter(corpus.split(' ')) + alphabet = set(corpus) ^ {' '} + + original = 'This is a coded sentence encoded by the SubwordTextEncoder.' + token_counts.update(original.split(' ')) + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( - 50, token_counts, 2, 10) - encoder.build_from_token_counts(token_counts, min_count=2) + 100, token_counts, 2, 10) - original = 'This sentence was encoded by the SubwordTextEncoder.' + # Encoding should be reversible. encoded = encoder.encode(original) decoded = encoder.decode(encoded) self.assertEqual(original, decoded) + # The substrings coded and coder are frequent enough in the corpus that + # they should appear in the vocabulary even though they are substrings + # of other included strings. + subtoken_strings = {encoder._all_subtoken_strings[i] for i in encoded} + self.assertIn('encoded_', subtoken_strings) + self.assertIn('coded_', subtoken_strings) + self.assertIn('TextEncoder', encoder._all_subtoken_strings) + self.assertIn('coder', encoder._all_subtoken_strings) + + # Every character in the corpus should be in the encoder's alphabet and + # its subtoken vocabulary. + self.assertTrue(alphabet.issubset(encoder._alphabet)) + for a in alphabet: + self.assertIn(a, encoder._all_subtoken_strings) + + def test_unicode(self): + corpus = 'Cat emoticons. 
\U0001F638 \U0001F639 \U0001F63A \U0001F63B' + token_counts = collections.Counter(corpus.split(' ')) + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + + self.assertIn('\U0001F638', encoder._alphabet) + self.assertIn('\U0001F63B', encoder._all_subtoken_strings) + + def test_small_vocab(self): + corpus = 'The quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + alphabet = set(corpus) ^ {' '} + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 10, token_counts, 2, 10) + + # All vocabulary elements are in the alphabet and subtoken strings even + # if we requested a smaller vocabulary to assure all expected strings + # are encodable. + self.assertTrue(alphabet.issubset(encoder._alphabet)) + for a in alphabet: + self.assertIn(a, encoder._all_subtoken_strings) + + def test_encodable_when_not_in_alphabet(self): + corpus = 'the quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + original = 'This has UPPER CASE letters that are out of alphabet' + + # Early versions could have an infinite loop when breaking into subtokens + # if there was any out-of-alphabet characters in the encoded string. + encoded = encoder.encode(original) + decoded = encoder.decode(encoded) + + self.assertEqual(original, decoded) + encoded_str = ''.join(encoder._all_subtoken_strings[i] for i in encoded) + self.assertIn('\\84;', encoded_str) + + @mock.patch.object(text_encoder, '_ESCAPE_CHARS', new=set('\\_;13579')) + def test_raises_exception_when_not_encodable(self): + corpus = 'the quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + + # Deliberately exclude some required encoding chars from the alphabet + # and token list, making some strings unencodable. + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + original = 'This has UPPER CASE letters that are out of alphabet' + + # Previously there was a bug which produced an infinite loop in this case. + with self.assertRaises(AssertionError): + encoder.encode(original) + if __name__ == '__main__': tf.test.main() diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 0f4141199..1acffc04c 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -30,7 +30,7 @@ alphanumeric character and a non-alphanumeric character. This produces a list which alternates between "alphanumeric tokens" (strings of alphanumeric characters) and "non-alphanumeric tokens" - (strings of of non-alphanumeric characters). + (strings of non-alphanumeric characters). 2. Remove every token consisting of a single space, unless it is the very first or very last token in the list. 
These tokens are now @@ -44,28 +44,26 @@ from __future__ import division from __future__ import print_function -from collections import defaultdict +import collections import sys import unicodedata # Dependency imports -from six import PY2 -from six import unichr # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin - +import six import tensorflow as tf +xrange = six.moves.xrange # pylint: disable=redefined-builtin # Conversion between Unicode and UTF-8, if required (on Python2) -_native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s) +_native_to_unicode = (lambda s: s.decode("utf-8")) if six.PY2 else (lambda s: s) # This set contains all letter and number characters. _ALPHANUMERIC_CHAR_SET = set( - unichr(i) for i in xrange(sys.maxunicode) - if (unicodedata.category(unichr(i)).startswith("L") or - unicodedata.category(unichr(i)).startswith("N"))) + six.unichr(i) for i in xrange(sys.maxunicode) + if (unicodedata.category(six.unichr(i)).startswith("L") or + unicodedata.category(six.unichr(i)).startswith("N"))) def encode(text): @@ -110,42 +108,86 @@ def decode(tokens): return "".join(ret) -def corpus_token_counts(text_filepattern, corpus_max_lines, - split_on_newlines=True): +def _read_filepattern(filepattern, max_lines=None, split_on_newlines=True): + """Reads files matching a wildcard pattern, yielding the contents. + + Args: + filepattern: A wildcard pattern matching one or more files. + max_lines: If set, stop reading after reading this many lines. + split_on_newlines: A boolean. If true, then split files by lines and strip + leading and trailing whitespace from each line. Otherwise, treat each + file as a single string. + + Yields: + The contents of the files as lines, if split_on_newlines is True, or + the entire contents of each file if False. + """ + filenames = tf.gfile.Glob(filepattern) + lines_read = 0 + for filename in filenames: + with tf.gfile.Open(filename) as f: + if split_on_newlines: + for line in f: + yield line.strip() + lines_read += 1 + if max_lines and lines_read >= max_lines: + return + + else: + if max_lines: + doc = [] + for line in f: + doc.append(line) + lines_read += 1 + if max_lines and lines_read >= max_lines: + yield "".join(doc) + return + yield "".join(doc) + + else: + yield f.read() + + +def corpus_token_counts( + text_filepattern, corpus_max_lines, split_on_newlines=True): """Read the corpus and compute a dictionary of token counts. Args: - text_filepattern: a pattern matching one or more files - corpus_max_lines: an integer - maximum total lines to read. - split_on_newlines: a boolean. If true, then split files by lines and strip - leading and trailing whitespace from each line. + text_filepattern: A pattern matching one or more files. + corpus_max_lines: An integer; maximum total lines to read. + split_on_newlines: A boolean. If true, then split files by lines and strip + leading and trailing whitespace from each line. Otherwise, treat each + file as a single string. Returns: - a dictionary from token to count. + a dictionary mapping token to count. 
""" - def read_corpus(): - """Read the corpus.""" - docs = [] - lines_read = 0 - filenames = tf.gfile.Glob(text_filepattern) - for text_filename in filenames: - with tf.gfile.Open(text_filename) as f: - if not split_on_newlines: - docs.append("") - for line in f: - if split_on_newlines: - # The tokenizer updates token_counts in encode() - docs.append(line.strip()) - else: - docs[-1] += line - lines_read += 1 - if corpus_max_lines > 0 and lines_read > corpus_max_lines: - return docs - return docs - - counts = defaultdict(int) - for doc in read_corpus(): - for tok in encode(_native_to_unicode(doc)): - counts[tok] += 1 + counts = collections.Counter() + for doc in _read_filepattern( + text_filepattern, + max_lines=corpus_max_lines, + split_on_newlines=split_on_newlines): + counts.update(encode(_native_to_unicode(doc))) + return counts + +def vocab_token_counts(text_filepattern, max_lines): + """Read a vocab file and return a dictionary of token counts. + + Reads a two-column CSV file of tokens and their frequency in a dataset. The + tokens are presumed to be generated by encode() or the equivalent. + + Args: + text_filepattern: A pattern matching one or more files. + max_lines: An integer; maximum total lines to read. + + Returns: + a dictionary mapping token to count. + """ + ret = {} + for line in _read_filepattern(text_filepattern, max_lines=max_lines): + token, count = line.rsplit(",", 1) + ret[_native_to_unicode(token)] = int(count) + + return ret diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index 189f19663..792ef4dbb 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -20,45 +20,132 @@ from __future__ import division from __future__ import print_function +import os import random # Dependency imports -from six import unichr # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin +import six from tensor2tensor.data_generators import tokenizer - import tensorflow as tf +xrange = six.moves.xrange # pylint: disable=redefined-builtin + +FLAGS = tf.app.flags.FLAGS + +_TESTDATA = "google3/third_party/py/tensor2tensor/data_generators/test_data" + class TokenizerTest(tf.test.TestCase): - def testEncode(self): - self.assertEqual( - tokenizer.encode(u"Dude - that's so cool."), - [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]) - self.assertEqual( - tokenizer.encode(u"Łukasz est né en 1981."), - [u"Łukasz", u"est", u"né", u"en", u"1981", u"."]) - self.assertEqual( - tokenizer.encode(u" Spaces at the ends "), - [u" ", u"Spaces", u"at", u"the", u"ends", u" "]) - self.assertEqual(tokenizer.encode(u"802.11b"), [u"802", u".", u"11b"]) - self.assertEqual(tokenizer.encode(u"two. \nlines"), - [u"two", u". \n", u"lines"]) + def test_encode(self): + self.assertListEqual( + [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."], + tokenizer.encode(u"Dude - that's so cool.")) + self.assertListEqual( + [u"Łukasz", u"est", u"né", u"en", u"1981", u"."], + tokenizer.encode(u"Łukasz est né en 1981.")) + self.assertListEqual( + [u" ", u"Spaces", u"at", u"the", u"ends", u" "], + tokenizer.encode(u" Spaces at the ends ")) + self.assertListEqual( + [u"802", u".", u"11b"], + tokenizer.encode(u"802.11b")) + self.assertListEqual( + [u"two", u". \n", u"lines"], + tokenizer.encode(u"two. 
\nlines")) - def testDecode(self): + def test_decode(self): self.assertEqual( + u"Dude - that's so cool.", tokenizer.decode( - [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]), - u"Dude - that's so cool.") + [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])) - def testInvertibilityOnRandomStrings(self): - random.seed(123) + def test_invertibility_on_random_strings(self): for _ in xrange(1000): - s = u"".join([unichr(random.randint(0, 65535)) for _ in xrange(10)]) + s = u"".join( + six.unichr(random.randint(0, 65535)) for _ in xrange(10)) self.assertEqual(s, tokenizer.decode(tokenizer.encode(s))) +class TestTokenCounts(tf.test.TestCase): + + def setUp(self): + super(TestTokenCounts, self).setUp() + self.corpus_path = os.path.join( + FLAGS.test_srcdir, _TESTDATA, "corpus-*.txt") + self.vocab_path = os.path.join( + FLAGS.test_srcdir, _TESTDATA, "vocab-*.txt") + + def test_corpus_token_counts_split_on_newlines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=0, split_on_newlines=True) + + expected = { + u"'": 2, + u".": 2, + u". ": 1, + u"... ": 1, + u"Groucho": 1, + u"Marx": 1, + u"Mitch": 1, + u"Hedberg": 1, + u"I": 3, + u"in": 2, + u"my": 2, + u"pajamas": 2, + } + self.assertDictContainsSubset(expected, token_counts) + self.assertNotIn(u".\n\n", token_counts) + self.assertNotIn(u"\n", token_counts) + + def test_corpus_token_counts_no_split_on_newlines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=0, split_on_newlines=False) + + self.assertDictContainsSubset({u".\n\n": 2, u"\n": 3}, token_counts) + + def test_corpus_token_counts_split_with_max_lines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=5, split_on_newlines=True) + + self.assertIn(u"slept", token_counts) + self.assertNotIn(u"Mitch", token_counts) + + def test_corpus_token_counts_no_split_with_max_lines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=5, split_on_newlines=False) + + self.assertIn(u"slept", token_counts) + self.assertNotIn(u"Mitch", token_counts) + self.assertDictContainsSubset( + {u".\n\n": 1, u"\n": 2, u".\n": 1}, token_counts) + + def test_vocab_token_counts(self): + token_counts = tokenizer.vocab_token_counts( + self.vocab_path, 0) + + expected = { + "lollipop": 8, + "reverberated": 12, + "kattywampus": 11, + "balderdash": 10, + "jiggery-pokery": 14, + } + self.assertDictEqual(expected, token_counts) + + def test_vocab_token_counts_with_max_lines(self): + token_counts = tokenizer.vocab_token_counts( + self.vocab_path, 4) + + expected = { + "lollipop": 8, + "reverberated": 12, + "kattywampus": 11, + "balderdash": 10, + } + self.assertDictEqual(expected, token_counts) + + if __name__ == "__main__": tf.test.main() From c01617efd2a2f321633ffaeaebc8697d46ed0dc0 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 26 Jul 2017 15:01:19 -0700 Subject: [PATCH 17/21] Use TensorFlow idiom for importing six.moves.xrange. 
PiperOrigin-RevId: 163261434 --- tensor2tensor/data_generators/text_encoder.py | 3 +-- tensor2tensor/data_generators/tokenizer.py | 3 +-- tensor2tensor/data_generators/tokenizer_test.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 69d29779a..4bb1c875d 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -30,12 +30,11 @@ # Dependency imports import six +from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf -xrange = six.moves.xrange # pylint: disable=redefined-builtin - # Reserved tokens for things like padding and EOS symbols. PAD = "" EOS = "" diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 1acffc04c..5cb9fd32b 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -51,10 +51,9 @@ # Dependency imports import six +from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -xrange = six.moves.xrange # pylint: disable=redefined-builtin - # Conversion between Unicode and UTF-8, if required (on Python2) _native_to_unicode = (lambda s: s.decode("utf-8")) if six.PY2 else (lambda s: s) diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index 792ef4dbb..ad4a3ff04 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -26,11 +26,10 @@ # Dependency imports import six +from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf -xrange = six.moves.xrange # pylint: disable=redefined-builtin - FLAGS = tf.app.flags.FLAGS _TESTDATA = "google3/third_party/py/tensor2tensor/data_generators/test_data" From 5242ac6e59cf553820d31485509fc527339ada92 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 26 Jul 2017 17:33:54 -0700 Subject: [PATCH 18/21] Rm num_shards from Problem. Problems specify sharding themselves. PiperOrigin-RevId: 163281576 --- README.md | 1 - tensor2tensor/bin/t2t-datagen | 11 +++++++---- tensor2tensor/data_generators/algorithmic.py | 11 ++++------- tensor2tensor/data_generators/genetics.py | 14 ++++++++------ tensor2tensor/data_generators/image.py | 2 +- tensor2tensor/data_generators/problem.py | 2 +- tensor2tensor/data_generators/wmt.py | 6 ++---- 7 files changed, 23 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index c0e34e0fe..edd6460d0 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,6 @@ mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR t2t-datagen \ --data_dir=$DATA_DIR \ --tmp_dir=$TMP_DIR \ - --num_shards=100 \ --problem=$PROBLEM # Train diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 629014713..e4acb6731 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -63,7 +63,8 @@ flags.DEFINE_string("problem", "", "The name of the problem to generate data for.") flags.DEFINE_string("exclude_problems", "", "Comma-separates list of problems to exclude.") -flags.DEFINE_integer("num_shards", 10, "How many shards to use.") +flags.DEFINE_integer("num_shards", 0, "How many shards to use. 
Ignored for " + "registered Problems.") flags.DEFINE_integer("max_cases", 0, "Maximum number of cases to generate (unbounded if 0).") flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") @@ -252,7 +253,7 @@ def generate_data_for_problem(problem): if isinstance(dev_gen, int): # The dev set and test sets are generated as extra shards using the # training generator. The integer specifies the number of training - # shards. FLAGS.num_shards is ignored. + # shards. FLAGS.num_shards is ignored. num_training_shards = dev_gen tf.logging.info("Generating data for %s.", problem) all_output_files = generator_utils.combined_data_filenames( @@ -263,10 +264,11 @@ def generate_data_for_problem(problem): else: # usual case - train data and dev data are generated using separate # generators. + num_shards = FLAGS.num_shards or 10 tf.logging.info("Generating training data for %s.", problem) train_output_files = generator_utils.train_data_filenames( problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, - FLAGS.num_shards) + num_shards) generator_utils.generate_files(training_gen(), train_output_files, FLAGS.max_cases) tf.logging.info("Generating development data for %s.", problem) @@ -282,11 +284,12 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): tf.logging.info("Generating training data for %s.", problem_name) + if FLAGS.num_shards: + raise ValueError("--num_shards should not be set for registered Problem.") problem = registry.problem(problem_name) task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), os.path.expanduser(FLAGS.tmp_dir), - num_shards=FLAGS.num_shards, task_id=task_id) diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py index 017bc8470..c115a1ebe 100644 --- a/tensor2tensor/data_generators/algorithmic.py +++ b/tensor2tensor/data_generators/algorithmic.py @@ -66,10 +66,7 @@ def dev_size(self): def num_shards(self): return 10 - def generate_data(self, data_dir, _, num_shards=None, task_id=-1): - if num_shards is None: - num_shards = self.num_shards - + def generate_data(self, data_dir, _, task_id=-1): def generator_eos(generator): """Shift by NUM_RESERVED_IDS and append EOS token.""" for case in generator: @@ -87,7 +84,7 @@ def generator_eos(generator): utils.generate_dataset_and_shuffle( train_generator_eos(), - self.training_filepaths(data_dir, num_shards, shuffled=True), + self.training_filepaths(data_dir, self.num_shards, shuffled=True), dev_generator_eos(), self.dev_filepaths(data_dir, 1, shuffled=True), shuffle=False) @@ -254,7 +251,7 @@ def zipf_distribution(nbr_symbols, alpha): def zipf_random_sample(distr_map, sample_len): - """Helper function: Generate a random Zipf sample of given lenght. + """Helper function: Generate a random Zipf sample of given length. Args: distr_map: list of float, Zipf's distribution over nbr_symbols. @@ -287,7 +284,7 @@ def reverse_generator_nlplike(nbr_symbols, max_length: integer, maximum length of sequences to generate. nbr_cases: the number of cases to generate. scale_std_dev: float, Normal distribution's standard deviation scale factor - used to draw the lenght of sequence. Default = 1% of the max_length. + used to draw the length of sequence. Default = 1% of the max_length. alpha: float, Zipf's Law Distribution parameter. Default = 1.5. Usually for modelling natural text distribution is in the range [1.1-1.6]. 
diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py index 88b82cb49..4e8a6d987 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/genetics.py @@ -87,10 +87,11 @@ def feature_encoders(self, data_dir): "targets": text_encoder.TextEncoder() } - def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): - if num_shards is None: - num_shards = 100 + @property + def num_shards(self): + return 100 + def generate_data(self, data_dir, tmp_dir, task_id=-1): try: # Download source data if download_url specified h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, @@ -109,7 +110,7 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): # Collect created shard processes to start and join processes = [] - datasets = [(self.training_filepaths, num_shards, "train", + datasets = [(self.training_filepaths, self.num_shards, "train", num_train_examples), (self.dev_filepaths, 1, "valid", num_dev_examples), (self.test_filepaths, 1, "test", num_test_examples)] @@ -124,9 +125,10 @@ def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): start_idx, end_idx)) processes.append(p) - # Start and wait for processes in batches - assert len(processes) == num_shards + 2 # 1 per training shard + dev + test + # 1 per training shard + dev + test + assert len(processes) == self.num_shards + 2 + # Start and wait for processes in batches num_batches = int( math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES)) for i in xrange(num_batches): diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index acb1128ed..fdad8d432 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -338,7 +338,7 @@ def example_reading_spec(self, label_key=None): class ImageFSNS(ImageProblem): """Problem spec for French Street Name recognition.""" - def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): + def generate_data(self, data_dir, tmp_dir, task_id=-1): list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" "street/python/fsns_urls.txt") fsns_urls = generator_utils.maybe_download( diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 69d81e58e..67e3c6f90 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -135,7 +135,7 @@ class Problem(object): # BEGIN SUBCLASS INTERFACE # ============================================================================ - def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): + def generate_data(self, data_dir, tmp_dir, task_id=-1): raise NotImplementedError() def hparams(self, defaults, model_hparams): diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 9587d4d2a..97b191096 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -83,12 +83,10 @@ def vocab_name(self): def vocab_file(self): return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) - def generate_data(self, data_dir, tmp_dir, num_shards=None, task_id=-1): - if num_shards is None: - num_shards = self.num_shards + def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_dataset_and_shuffle( self.train_generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, num_shards, shuffled=False), + self.training_filepaths(data_dir, self.num_shards, 
shuffled=False), self.dev_generator(data_dir, tmp_dir), self.dev_filepaths(data_dir, 1, shuffled=False)) From 93b325f420d85d934d6280b316a248eca982c192 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 26 Jul 2017 18:18:04 -0700 Subject: [PATCH 19/21] Baseline model for GeneExpression problem PiperOrigin-RevId: 163286026 --- tensor2tensor/data_generators/all_problems.py | 2 +- .../{genetics.py => gene_expression.py} | 39 +++--- ...netics_test.py => gene_expression_test.py} | 8 +- tensor2tensor/models/common_layers.py | 31 ++-- tensor2tensor/models/gene_expression.py | 132 ++++++++++++++++++ tensor2tensor/models/gene_expression_test.py | 79 +++++++++++ tensor2tensor/models/modalities.py | 15 +- tensor2tensor/models/models.py | 1 + tensor2tensor/utils/metrics.py | 2 +- tensor2tensor/utils/trainer_utils.py | 12 +- 10 files changed, 279 insertions(+), 42 deletions(-) rename tensor2tensor/data_generators/{genetics.py => gene_expression.py} (90%) rename tensor2tensor/data_generators/{genetics_test.py => gene_expression_test.py} (89%) create mode 100644 tensor2tensor/models/gene_expression.py create mode 100644 tensor2tensor/models/gene_expression_test.py diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index d8007f5e3..6830cf0bf 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -34,7 +34,7 @@ # pylint: disable=g-import-not-at-top try: # Requires h5py - from tensor2tensor.data_generators import genetics + from tensor2tensor.data_generators import gene_expression except ImportError: pass # pylint: enable=g-import-not-at-top diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/gene_expression.py similarity index 90% rename from tensor2tensor/data_generators/genetics.py rename to tensor2tensor/data_generators/gene_expression.py index 4e8a6d987..31d1cd150 100644 --- a/tensor2tensor/data_generators/genetics.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Genetics problems. +"""Gene expression problems. Inputs are bases ACTG (with indices assigned in that order). @@ -82,7 +82,7 @@ def chunk_size(self): def feature_encoders(self, data_dir): del data_dir return { - "inputs": GeneticBaseEncoder(chunk_size=self.chunk_size), + "inputs": DNAEncoder(chunk_size=self.chunk_size), # TODO(rsepassi): RealEncoder? "targets": text_encoder.TextEncoder() } @@ -166,8 +166,15 @@ def example_reading_spec(self): def preprocess_examples(self, examples, mode): del mode + # Reshape targets examples["targets"] = tf.reshape(examples["targets"], [-1, 1, self.num_output_predictions]) + examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1, 1]) + + # Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them. + # Add epsilon because some unmasked labels are actually 0. 
+ examples["targets"] += 1e-6 + examples["targets"] *= examples["targets_mask"] return examples @@ -175,8 +182,8 @@ def eval_metrics(self): return [metrics.Metrics.RMSE] -@registry.register_problem("genetics_cage10") -class GeneticsCAGE10(GeneExpressionProblem): +@registry.register_problem("gene_expression_cage10") +class GeneExpressionCAGE10(GeneExpressionProblem): @property def download_url(self): @@ -187,8 +194,8 @@ def h5_file(self): return "cage10.h5" -@registry.register_problem("genetics_gm12878") -class GeneticsGM12878(GeneExpressionProblem): +@registry.register_problem("gene_expression_gm12878") +class GeneExpressionGM12878(GeneExpressionProblem): @property def download_url(self): @@ -199,8 +206,8 @@ def h5_file(self): return "gm12878.h5" -@registry.register_problem("genetics_l262k") -class GeneticsL262k(GeneExpressionProblem): +@registry.register_problem("gene_expression_l262k") +class GeneExpressionL262k(GeneExpressionProblem): @property def h5_file(self): @@ -236,7 +243,7 @@ def dataset_generator(filepath, chunk_size=1, start_idx=None, end_idx=None): - encoder = GeneticBaseEncoder(chunk_size=chunk_size) + encoder = DNAEncoder(chunk_size=chunk_size) with h5py.File(filepath, "r") as h5_file: # Get input keys from h5_file src_keys = [s % dataset for s in ["%s_in", "%s_na", "%s_out"]] @@ -291,7 +298,7 @@ def to_example_dict(encoder, inputs, mask, outputs): return ex_dict -class GeneticBaseEncoder(text_encoder.TextEncoder): +class DNAEncoder(text_encoder.TextEncoder): """ACTG strings to ints and back. Optionally chunks bases into single ids. Uses 'X' as an unknown base. @@ -302,14 +309,14 @@ class GeneticBaseEncoder(text_encoder.TextEncoder): def __init__(self, chunk_size=1, num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS): - super(GeneticBaseEncoder, self).__init__(num_reserved_ids=num_reserved_ids) + super(DNAEncoder, self).__init__(num_reserved_ids=num_reserved_ids) # Build a vocabulary of chunks of size chunk_size self._chunk_size = chunk_size chunks = [] for size in range(1, chunk_size + 1): - c = itertools.product(_bases + [GeneticBaseEncoder.UNK], repeat=size) + c = itertools.product(_bases + [DNAEncoder.UNK], repeat=size) num_pad = chunk_size - size - padding = (GeneticBaseEncoder.PAD,) * num_pad + padding = (DNAEncoder.PAD,) * num_pad c = [el + padding for el in c] chunks.extend(c) chunks.sort() @@ -323,7 +330,7 @@ def vocab_size(self): def encode(self, s): bases = list(s) - pad = [GeneticBaseEncoder.PAD] * (len(bases) % self._chunk_size) + pad = [DNAEncoder.PAD] * (len(bases) % self._chunk_size) bases.extend(pad) assert (len(bases) % self._chunk_size) == 0 num_chunks = len(bases) // self._chunk_size @@ -342,8 +349,8 @@ def decode(self, ids): for idx in ids: if idx >= self._num_reserved_ids: chunk = self._ids_to_chunk[idx] - if GeneticBaseEncoder.PAD in chunk: - chunk = chunk[:chunk.index(GeneticBaseEncoder.PAD)] + if DNAEncoder.PAD in chunk: + chunk = chunk[:chunk.index(DNAEncoder.PAD)] else: chunk = [text_encoder.RESERVED_TOKENS[idx]] bases.extend(chunk) diff --git a/tensor2tensor/data_generators/genetics_test.py b/tensor2tensor/data_generators/gene_expression_test.py similarity index 89% rename from tensor2tensor/data_generators/genetics_test.py rename to tensor2tensor/data_generators/gene_expression_test.py index 5eac1b249..2d7bbe832 100644 --- a/tensor2tensor/data_generators/genetics_test.py +++ b/tensor2tensor/data_generators/gene_expression_test.py @@ -22,7 +22,7 @@ import numpy as np -from tensor2tensor.data_generators import genetics +from 
tensor2tensor.data_generators import gene_expression import tensorflow as tf @@ -40,7 +40,7 @@ def _oneHotBases(self, bases): return np.array(one_hots) def testRecordToExample(self): - encoder = genetics.GeneticBaseEncoder(chunk_size=2) + encoder = gene_expression.DNAEncoder(chunk_size=2) raw_inputs = ["A", "C", "G", "X", "C", "T"] # Put in numpy arrays in the same format as in the h5 file @@ -48,7 +48,7 @@ def testRecordToExample(self): mask = np.array([True, False, True]) outputs = np.array([[1.0, 2.0, 3.0], [5.0, 1.0, 0.2], [5.1, 2.3, 2.3]]) # Convert to example dict - ex_dict = genetics.to_example_dict(encoder, inputs, mask, outputs) + ex_dict = gene_expression.to_example_dict(encoder, inputs, mask, outputs) self.assertEqual(len(raw_inputs) // 2 + 1, len(ex_dict["inputs"])) self.assertAllEqual(encoder.encode(raw_inputs) + [1], ex_dict["inputs"]) @@ -61,7 +61,7 @@ def testGenerateShardArgs(self): num_examples = 37 num_shards = 4 outfiles = [str(i) for i in range(num_shards)] - shard_args = genetics.generate_shard_args(outfiles, num_examples) + shard_args = gene_expression.generate_shard_args(outfiles, num_examples) starts, ends, fnames = zip(*shard_args) self.assertAllEqual([0, 9, 18, 27], starts) diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index 37e791bc3..e98531d88 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -469,7 +469,10 @@ def get_norm(norm_type): "'noam', 'none'.") -def residual_fn(x, y, norm_type, residual_dropout, +def residual_fn(x, + y, + norm_type, + residual_dropout, filters=None, epsilon=1e-16, name="residual"): @@ -559,11 +562,17 @@ def conv_block_internal(conv_fn, def conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): - """A block of standard convolutions.""" + """A block of standard 2d convolutions.""" return conv_block_internal(conv, inputs, filters, dilation_rates_and_kernel_sizes, **kwargs) +def conv1d_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): + """A block of standard 1d convolutions.""" + return conv_block_internal(conv1d, inputs, filters, + dilation_rates_and_kernel_sizes, **kwargs) + + def separable_conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): """A block of separable convolutions.""" @@ -858,10 +867,7 @@ def multiscale_conv_sum(inputs, output_size, dilation_rates_and_kernel_sizes, return tf.add_n(results) * (len(results)**-0.5) -def multiscale_conv_and_attention(x, - padding, - hparams, - source=None): +def multiscale_conv_and_attention(x, padding, hparams, source=None): """A common part of t2t layers. First, do a linear multiscale convolution @@ -925,10 +931,7 @@ def conv_with_pools(inputs, output_size, kernel_size, pool_sizes, pooling_type, return tf.add_n(results) * (len(results)**-0.5) -def conv_with_pools_and_attention(x, - padding, - hparams, - source=None): +def conv_with_pools_and_attention(x, padding, hparams, source=None): """A common part of t2t layers. 
First, do conv_with_pools @@ -1389,8 +1392,8 @@ def padded_cross_entropy(logits, vocab_size = tf.shape(logits)[-1] with tf.name_scope("padded_cross_entropy", [logits, labels]): pad_logits, pad_labels = pad_with_zeros(logits, labels) - xent = smoothing_cross_entropy(pad_logits, pad_labels, - vocab_size, confidence) + xent = smoothing_cross_entropy(pad_logits, pad_labels, vocab_size, + confidence) weights = weights_fn(pad_labels) if not reduce_sum: return xent * weights, weights @@ -1493,8 +1496,8 @@ def linear_set_layer(layer_size, # Unfortunately tf doesn't support broadcasting via concat, but we can # simply add the transformed context to get the same effect. context = tf.expand_dims(context, axis=1) - cont_tfm = conv1d(context, layer_size, 1, - activation=None, name="cont_conv") + cont_tfm = conv1d( + context, layer_size, 1, activation=None, name="cont_conv") outputs += cont_tfm if activation_fn is not None: diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py new file mode 100644 index 000000000..bdb93509b --- /dev/null +++ b/tensor2tensor/models/gene_expression.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Models for gene expression from DNA.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class GeneExpressionConv(t2t_model.T2TModel): + """Gene expression conv net. + + Based on "Basenji" model from + http://www.biorxiv.org/content/early/2017/07/10/161851 + + Uses layer_norm instead of batch_norm. 
+ """ + + def model_fn_body(self, features): + inputs = features["inputs"] + inputs.get_shape().assert_has_rank(4) + + hp = self._hparams + + out = inputs + out = common_layers.flatten4d3d(out) + + # Conv layers + for i in xrange(hp.num_conv_layers): + out = conv_layer( + out, + hp.hidden_size, + hp.kernel_width, + hp.stride, + hp.pooling_windows[i], + hp.dropout, + 1, + name="conv_%d" % (i + 1)) + + # Dense dilated conv layers + for i in xrange(hp.num_dconv_layers): + dilation_rate = 2**(i + 1) + dconv_out = conv_layer( + out, + hp.hidden_size, + hp.kernel_width, + 1, + 0, + hp.dropout, + dilation_rate, + name="dconv_%d" % (i + 1)) + out = tf.concat([out, dconv_out], axis=2) + + # Fully connected layer + out = fc_layer(out, hp.hidden_size, hp.dropout, name="fc") + + out.get_shape().assert_has_rank(3) + out = tf.expand_dims(out, 2) + return out + + +def conv_layer(x, + hidden_size, + kernel_size, + stride, + pooling_window, + dropout_rate, + dilation_rate, + name="conv"): + with tf.variable_scope(name): + out = x + out = common_layers.conv1d_block( + out, + hidden_size, [(dilation_rate, kernel_size)], + strides=stride, + first_relu=False, + padding="same") + out = tf.nn.relu(out) + if pooling_window: + out = tf.layers.max_pooling1d( + out, pooling_window, pooling_window, padding="same") + out = tf.layers.dropout(out, dropout_rate) + return out + + +def fc_layer(x, num_out, dropout_rate, name="fc"): + with tf.variable_scope(name): + out = x + out = tf.layers.dense(out, num_out) + out = tf.contrib.layers.layer_norm(out) + out = tf.nn.relu(out) + out = tf.layers.dropout(out, dropout_rate) + return out + + +@registry.register_hparams +def gene_expression_conv_base(): + """Hparams for GeneExpressionConv model.""" + hparams = common_hparams.basic_params1() + hparams.add_hparam("num_conv_layers", 4) + hparams.add_hparam("num_dconv_layers", 7) + hparams.add_hparam("pooling_windows", [2, 4, 4, 4]) + + # TODO(rsepassi): Correct the values of these hyperparameters + hparams.hidden_size = 128 + hparams.kernel_width = 128 + hparams.add_hparam("stride", 1) + return hparams diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py new file mode 100644 index 000000000..bec5268fd --- /dev/null +++ b/tensor2tensor/models/gene_expression_test.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Gene Expression models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import gene_expression as gene_data +from tensor2tensor.models import gene_expression +from tensor2tensor.models import modalities # pylint: disable=unused-import + +import tensorflow as tf + + +def gene_expression_conv_test(): + hparams = gene_expression.gene_expression_conv_base() + hparams.hidden_size = 8 + hparams.num_dconv_layers = 2 + return hparams + + +class GeneExpressionModelsTest(tf.test.TestCase): + + def _testModel(self, hparams, model_cls): + batch_size = 3 + target_length = 6 + target_out = 10 # GeneExpressionProblem.num_output_predictions + input_length = target_length * 128 + input_vocab_size = 5 + + inputs = np.random.random_integers( + input_vocab_size, size=(batch_size, input_length, 1, 1)) + targets = np.random.random_sample((batch_size, target_length, 1, + target_out)) + + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.float32), + } + p_hparams, = hparams.problems + sharded_logits, _, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams).model_fn(features) + logits = tf.concat(sharded_logits, 0) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + res = sess.run(logits) + + self.assertEqual(res.shape, (batch_size, target_length, 1, target_out)) + + def testGeneExpressionModels(self): + models_hparams = [(gene_expression.GeneExpressionConv, + gene_expression_conv_test())] + for model_cls, hparams in models_hparams: + hparams.add_hparam("data_dir", None) + p_hparams = gene_data.GeneExpressionCAGE10().internal_hparams(hparams) + hparams.problems = [p_hparams] + self._testModel(hparams, model_cls) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 50a3da55d..20464c0a2 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -166,7 +166,7 @@ def targets_bottom(self, inputs): def top(self, body_output, _): with tf.variable_scope("rgb_softmax"): - # seperate embedding for each channel + # separate embedding for each channel # assuming the body output returns a tensor of shape # [batch_size, rows, cols, channels, self._body_input_depth] body_output_split = tf.split(body_output, self._channels, axis=3) @@ -488,10 +488,15 @@ def top_sharded(self, sharded_targets) def l2_loss(predictions, targets): - return tf.reduce_mean(tf.pow(predictions - targets, 2)) - - loss = data_parallelism(l2_loss, sharded_predictions, sharded_targets) - return sharded_predictions, tf.add_n(loss) + with tf.name_scope("l2"): + weights = weights_fn(targets) + l2 = tf.pow(predictions - targets, 2) + return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) + + loss_num, loss_den = data_parallelism(l2_loss, sharded_predictions, + sharded_targets) + loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) + return sharded_predictions, loss @registry.register_image_modality("identity_no_pad") diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index e92ddd3ed..907a801cf 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -27,6 +27,7 @@ from tensor2tensor.models import attention_lm_moe from tensor2tensor.models import bluenet from tensor2tensor.models import bytenet +from 
tensor2tensor.models import gene_expression from tensor2tensor.models import long_answer from tensor2tensor.models import lstm from tensor2tensor.models import modalities diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 29f44b574..ae9ce3882 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -39,7 +39,7 @@ class Metrics(object): RMSE = "rmse" -def padded_rmse(predictions, labels, weights_fn=common_layers.weights_all): +def padded_rmse(predictions, labels, weights_fn=common_layers.weights_nonzero): predictions, labels = common_layers.pad_with_zeros(predictions, labels) targets = labels weights = weights_fn(targets) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index bf42c36cc..1dbb84d4f 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -45,6 +45,7 @@ import tensorflow as tf from tensorflow.contrib.learn.python.learn import learn_runner +from tensorflow.python import debug from tensorflow.python.ops import init_ops # Number of samples to draw for an image input (in such cases as captioning) @@ -55,6 +56,8 @@ flags.DEFINE_bool("registry_help", False, "If True, logs the contents of the registry and exits.") +flags.DEFINE_bool("tfdbg", False, + "If True, use the TF debugger CLI on train/eval.") flags.DEFINE_string("output_dir", "", "Base output directory for run.") flags.DEFINE_string("model", "", "Which model to use.") flags.DEFINE_string("hparams_set", "", "Which parameters to use.") @@ -168,6 +171,12 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, FLAGS.objective not in eval_metrics): raise ValueError("Tuning objective %s not among evaluation metrics %s" % (FLAGS.objective, eval_metrics.keys())) + train_monitors = [] + eval_hooks = [] + if FLAGS.tfdbg: + hook = debug.LocalCLIDebugHook() + train_monitors.append(hook) + eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, train_input_fn=input_fns["train"], @@ -176,7 +185,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, train_steps=train_steps, eval_steps=eval_steps, min_eval_frequency=FLAGS.local_eval_frequency, - train_monitors=[]) + train_monitors=train_monitors, + eval_hooks=eval_hooks) def create_experiment_components(hparams, output_dir, data_dir, model_name): From 175a125927961a366a023fa4925c15e39561e003 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 26 Jul 2017 19:19:52 -0700 Subject: [PATCH 20/21] v1.1.2 PiperOrigin-RevId: 163290663 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9da5293b9..66d51d7e1 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.1', + version='1.1.2', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From 36766d84aa3da941be1f74efb10fbc4b409500d4 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 11:49:11 -0700 Subject: [PATCH 21/21] internal-external fixes and enable tests PiperOrigin-RevId: 163370562 --- setup.py | 3 + .../data_generators/concatenate_examples.py | 21 +++--- tensor2tensor/data_generators/inspect.py | 24 +++---- .../text_encoder_build_subword.py | 36 +++++----- tensor2tensor/data_generators/tokenizer.py | 2 +- .../data_generators/tokenizer_test.py | 65 +++++++++---------- tensor2tensor/utils/trainer_utils.py | 4 +- 7 files changed, 74 insertions(+), 81 deletions(-) diff --git a/setup.py b/setup.py index 
66d51d7e1..6be9aba04 100644
--- a/setup.py
+++ b/setup.py
@@ -12,6 +12,7 @@
     url='http://github.com/tensorflow/tensor2tensor',
     license='Apache 2.0',
     packages=find_packages(),
+    package_data={'tensor2tensor.data_generators': ['test_data/*']},
     scripts=[
         'tensor2tensor/bin/t2t-trainer',
         'tensor2tensor/bin/t2t-datagen',
@@ -26,6 +27,8 @@
         'tensorflow': ['tensorflow>=1.2.0rc1'],
         'tensorflow_gpu': ['tensorflow-gpu>=1.2.0rc1'],
     },
+    tests_require=['nose'],
+    test_suite='nose.collector',
     classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py
index 60ac7ea8f..9d7678fc4 100644
--- a/tensor2tensor/data_generators/concatenate_examples.py
+++ b/tensor2tensor/data_generators/concatenate_examples.py
@@ -34,7 +34,7 @@
       + subtokenizer.encode("target French Je t'aime.") + [1])
 }
 
-We add a dummy feature "inputs"=[0] for compatability with seq-to-seq models.
+We add a dummy feature "inputs"=[0] for compatibility with seq-to-seq models.
 
 If FLAGS.combine_to_length is nonzero, then we combine multiple examples into
 examples of a constant length, possibly with some padding at the end.
@@ -53,34 +53,33 @@
 from tensor2tensor.data_generators import text_encoder
 
 import tensorflow as tf
 
-tf.app.flags.DEFINE_string("vocab_file", "",
-                           "SubwordTextEncoder vocabulary file")
+tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file")
 
-tf.app.flags.DEFINE_boolean(
+tf.flags.DEFINE_boolean(
     "random_reverse", False,
     "If true, write half of the example with source/target reversed")
 
-tf.app.flags.DEFINE_boolean(
+tf.flags.DEFINE_boolean(
     "count_everything", False,
     "If true, assign positive weights to designators, source and target. "
    "If false, assign positive weights only to target.")
 
-tf.app.flags.DEFINE_string("source_domain_string", "English", "")
-tf.app.flags.DEFINE_string("target_domain_string", "French", "")
+tf.flags.DEFINE_string("source_domain_string", "English", "")
+tf.flags.DEFINE_string("target_domain_string", "French", "")
 
-tf.app.flags.DEFINE_integer(
+tf.flags.DEFINE_integer(
     "combine_to_length", 0,
     "If positive, concatenate examples to form examples with target length "
     " equal to this value. Targets are padded with subtoken id=0.")
 
-tf.app.flags.DEFINE_string("in_file", "", "input filename")
+tf.flags.DEFINE_string("in_file", "", "input filename")
 
-tf.app.flags.DEFINE_string(
+tf.flags.DEFINE_string(
     "out_prefix", "/usr/local/google/tmp/concat",
     "The output filename is equal to out_prefix plus "
     "the last 15 characters of in_file. (e.g. "
     "-00001-of-00100)")
 
-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS
 
 
 def _make_example(ids, weights, raw_num_bytes):
diff --git a/tensor2tensor/data_generators/inspect.py b/tensor2tensor/data_generators/inspect.py
index 6ba054d3c..848b74a2d 100644
--- a/tensor2tensor/data_generators/inspect.py
+++ b/tensor2tensor/data_generators/inspect.py
@@ -32,19 +32,16 @@
 import tensorflow as tf
 
-tf.app.flags.DEFINE_string("subword_text_encoder_filename", "",
-                           "SubwordTextEncoder vocabulary file")
-tf.app.flags.DEFINE_string("token_text_encoder_filename", "",
-                           "TokenTextEncoder vocabulary file")
-tf.app.flags.DEFINE_bool("byte_text_encoder", False,
-                         "use a ByteTextEncoder")
-tf.app.flags.DEFINE_string("input_filename", "", "input filename")
-tf.app.flags.DEFINE_bool("print_inputs", False,
-                         "Print decoded inputs to stdout")
-tf.app.flags.DEFINE_bool("print_targets", False,
-                         "Print decoded targets to stdout")
+tf.flags.DEFINE_string("subword_text_encoder_filename", "",
+                       "SubwordTextEncoder vocabulary file")
+tf.flags.DEFINE_string("token_text_encoder_filename", "",
+                       "TokenTextEncoder vocabulary file")
+tf.flags.DEFINE_bool("byte_text_encoder", False, "use a ByteTextEncoder")
+tf.flags.DEFINE_string("input_filename", "", "input filename")
+tf.flags.DEFINE_bool("print_inputs", False, "Print decoded inputs to stdout")
+tf.flags.DEFINE_bool("print_targets", False, "Print decoded targets to stdout")
 
-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS
 
 
 def main(_):
@@ -53,8 +50,7 @@ def main(_):
     encoder = text_encoder.SubwordTextEncoder(
         FLAGS.subword_text_encoder_filename)
   elif FLAGS.token_text_encoder_filename:
-    encoder = text_encoder.TokenTextEncoder(
-        FLAGS.token_text_encoder_filename)
+    encoder = text_encoder.TokenTextEncoder(FLAGS.token_text_encoder_filename)
   elif FLAGS.byte_text_encoder:
     encoder = text_encoder.ByteTextEncoder()
   else:
diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py
index 88dfac116..47e82a176 100644
--- a/tensor2tensor/data_generators/text_encoder_build_subword.py
+++ b/tensor2tensor/data_generators/text_encoder_build_subword.py
@@ -39,19 +39,18 @@
 import tensorflow as tf
 
-tf.app.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
-                           'where to store the SubwordTextEncoder')
-tf.app.flags.DEFINE_string('corpus_filepattern', '',
-                           'Corpus of one or more text files')
-tf.app.flags.DEFINE_string('vocab_filepattern', '',
-                           'One or more vocabulary files '
-                           '(one word per line as "word,count")')
-tf.app.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus')
-tf.app.flags.DEFINE_integer('corpus_max_lines', 10000,
-                            'How many lines of corpus to read')
-tf.app.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations')
-tf.app.flags.DEFINE_bool('split_on_newlines', True, 'Break corpus into lines.')
-FLAGS = tf.app.flags.FLAGS
+tf.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
+                       'where to store the SubwordTextEncoder')
+tf.flags.DEFINE_string('corpus_filepattern', '',
+                       'Corpus of one or more text files')
+tf.flags.DEFINE_string('vocab_filepattern', '', 'One or more vocabulary files '
+                       '(one word per line as "word,count")')
+tf.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus')
+tf.flags.DEFINE_integer('corpus_max_lines', 10000,
+                        'How many lines of corpus to read')
+tf.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations')
+tf.flags.DEFINE_bool('split_on_newlines', True, 'Break corpus into lines.')
+FLAGS = tf.flags.FLAGS
 
 
 def main(unused_argv):
@@ -61,20 +60,21 @@ def main(unused_argv):
   elif FLAGS.corpus_filepattern:
     token_counts = tokenizer.corpus_token_counts(
-        FLAGS.corpus_filepattern, FLAGS.corpus_max_lines,
+        FLAGS.corpus_filepattern,
+        FLAGS.corpus_max_lines,
         split_on_newlines=FLAGS.split_on_newlines)
   elif FLAGS.vocab_filepattern:
-    token_counts = tokenizer.vocab_token_counts(
-        FLAGS.vocab_filepattern, FLAGS.corpus_max_lines)
+    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
+                                                FLAGS.corpus_max_lines)
   else:
     raise ValueError(
         'Must provide one of --corpus_filepattern or --vocab_filepattern')
 
   encoder = text_encoder.SubwordTextEncoder()
-  encoder.build_from_token_counts(
-      token_counts, FLAGS.min_count, FLAGS.num_iterations)
+  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
+                                  FLAGS.num_iterations)
   encoder.store_to_file(FLAGS.output_fn)
diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py
index 5cb9fd32b..0e8daa75f 100644
--- a/tensor2tensor/data_generators/tokenizer.py
+++ b/tensor2tensor/data_generators/tokenizer.py
@@ -121,7 +121,7 @@ def _read_filepattern(filepattern, max_lines=None, split_on_newlines=True):
     The contents of the files as lines, if split_on_newlines is True, or
     the entire contents of each file if False.
   """
-  filenames = tf.gfile.Glob(filepattern)
+  filenames = sorted(tf.gfile.Glob(filepattern))
   lines_read = 0
   for filename in filenames:
     with tf.gfile.Open(filename) as f:
diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py
index ad4a3ff04..0c299bd0b 100644
--- a/tensor2tensor/data_generators/tokenizer_test.py
+++ b/tensor2tensor/data_generators/tokenizer_test.py
@@ -30,9 +30,10 @@
 from tensor2tensor.data_generators import tokenizer
 
 import tensorflow as tf
 
-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS
 
-_TESTDATA = "google3/third_party/py/tensor2tensor/data_generators/test_data"
+pkg_dir, _ = os.path.split(__file__)
+_TESTDATA = os.path.join(pkg_dir, "test_data")
 
 
 class TokenizerTest(tf.test.TestCase):
@@ -41,18 +42,13 @@ def test_encode(self):
     self.assertListEqual(
         [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."],
         tokenizer.encode(u"Dude - that's so cool."))
-    self.assertListEqual(
-        [u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
-        tokenizer.encode(u"Łukasz est né en 1981."))
-    self.assertListEqual(
-        [u" ", u"Spaces", u"at", u"the", u"ends", u" "],
-        tokenizer.encode(u" Spaces at the ends "))
-    self.assertListEqual(
-        [u"802", u".", u"11b"],
-        tokenizer.encode(u"802.11b"))
-    self.assertListEqual(
-        [u"two", u". \n", u"lines"],
-        tokenizer.encode(u"two. \nlines"))
+    self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
+                         tokenizer.encode(u"Łukasz est né en 1981."))
+    self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "],
+                         tokenizer.encode(u" Spaces at the ends "))
+    self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b"))
+    self.assertListEqual([u"two", u". \n", u"lines"],
+                         tokenizer.encode(u"two. \nlines"))
 
   def test_decode(self):
     self.assertEqual(
@@ -62,8 +58,7 @@ def test_decode(self):
 
   def test_invertibility_on_random_strings(self):
     for _ in xrange(1000):
-      s = u"".join(
-          six.unichr(random.randint(0, 65535)) for _ in xrange(10))
+      s = u"".join(six.unichr(random.randint(0, 65535)) for _ in xrange(10))
       self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
 
 
@@ -71,10 +66,8 @@ class TestTokenCounts(tf.test.TestCase):
 
   def setUp(self):
     super(TestTokenCounts, self).setUp()
-    self.corpus_path = os.path.join(
-        FLAGS.test_srcdir, _TESTDATA, "corpus-*.txt")
-    self.vocab_path = os.path.join(
-        FLAGS.test_srcdir, _TESTDATA, "vocab-*.txt")
+    self.corpus_path = os.path.join(_TESTDATA, "corpus-*.txt")
+    self.vocab_path = os.path.join(_TESTDATA, "vocab-*.txt")
 
   def test_corpus_token_counts_split_on_newlines(self):
     token_counts = tokenizer.corpus_token_counts(
@@ -117,31 +110,33 @@ def test_corpus_token_counts_no_split_with_max_lines(self):
     self.assertIn(u"slept", token_counts)
     self.assertNotIn(u"Mitch", token_counts)
-    self.assertDictContainsSubset(
-        {u".\n\n": 1, u"\n": 2, u".\n": 1}, token_counts)
+    self.assertDictContainsSubset({
+        u".\n\n": 1,
+        u"\n": 2,
+        u".\n": 1
+    }, token_counts)
 
   def test_vocab_token_counts(self):
-    token_counts = tokenizer.vocab_token_counts(
-        self.vocab_path, 0)
+    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 0)
     expected = {
-        "lollipop": 8,
-        "reverberated": 12,
-        "kattywampus": 11,
-        "balderdash": 10,
-        "jiggery-pokery": 14,
+        u"lollipop": 8,
+        u"reverberated": 12,
+        u"kattywampus": 11,
+        u"balderdash": 10,
+        u"jiggery-pokery": 14,
     }
     self.assertDictEqual(expected, token_counts)
 
   def test_vocab_token_counts_with_max_lines(self):
-    token_counts = tokenizer.vocab_token_counts(
-        self.vocab_path, 4)
+    # vocab-1 has 2 lines, vocab-2 has 3
+    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 4)
     expected = {
-        "lollipop": 8,
-        "reverberated": 12,
-        "kattywampus": 11,
-        "balderdash": 10,
+        u"lollipop": 8,
+        u"reverberated": 12,
+        u"kattywampus": 11,
+        u"balderdash": 10,
     }
     self.assertDictEqual(expected, token_counts)
diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py
index 1dbb84d4f..bf105c5ae 100644
--- a/tensor2tensor/utils/trainer_utils.py
+++ b/tensor2tensor/utils/trainer_utils.py
@@ -167,7 +167,7 @@ def create_experiment(output_dir, data_dir, model_name, train_steps,
       model_name=model_name)
   eval_metrics = metrics.create_evaluation_metrics(
       zip(FLAGS.problems.split("-"), hparams.problem_instances))
-  if ("autotune" in FLAGS and FLAGS.autotune and
+  if (hasattr(FLAGS, "autotune") and FLAGS.autotune and
       FLAGS.objective not in eval_metrics):
     raise ValueError("Tuning objective %s not among evaluation metrics %s" %
                      (FLAGS.objective, eval_metrics.keys()))
@@ -572,7 +572,7 @@ def nth_model(n):
       # Define the train_op for the TRAIN mode.
      opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams)
       tf.logging.info("Computing gradients for global model_fn.")
-      opt_summaries = ["learning_rate", "loss", "global_gradient_norm"]
+      opt_summaries = ["learning_rate", "loss"]
       if hparams.summarize_grads:
         opt_summaries.extend(["gradients", "gradient_norm"])
       train_op = tf.contrib.layers.optimize_loss(