diff --git a/docs/cloud_mlengine.md b/docs/cloud_mlengine.md index b257fab25..0750f5088 100644 --- a/docs/cloud_mlengine.md +++ b/docs/cloud_mlengine.md @@ -41,8 +41,8 @@ principle work just fine. Contributions/testers welcome. Launching on Cloud ML Engine works with `--t2t_usr_dir` as well as long as the directory is fully self-contained (i.e. the imports only refer to other modules in the directory). If there are additional PyPI dependencies that you need, you -can include a `setup.py` file in your directory (ensure that it uses -`setuptools.find_packages`). +can include a `requirements.txt` file in the directory specified by +`t2t_usr_dir`. # Hyperparameter Tuning diff --git a/docs/new_problem.md b/docs/new_problem.md index 7564e4ad8..fab76d90d 100644 --- a/docs/new_problem.md +++ b/docs/new_problem.md @@ -65,10 +65,10 @@ class PoetryLines(text_problems.Text2TextProblem): # 10% evaluation data return [{ "split": problem.DatasetSplit.TRAIN, - "shards": 90, + "shards": 9, }, { "split": problem.DatasetSplit.EVAL, - "shards": 10, + "shards": 1, }] def generate_samples(self, data_dir, tmp_dir, dataset_split): @@ -133,7 +133,7 @@ pre-existing "training" and "evaluation" sets. If we did, we'd set split. The `dataset_splits` method determines the fraction that goes to each split. The -training data will be generated into 90 files and the evaluation data into 10. +training data will be generated into 9 files and the evaluation data into 1. 90% of the data will be for training. 10% of the data will be for evaluation. ```python @@ -148,10 +148,10 @@ training data will be generated into 90 files and the evaluation data into 10. # 10% evaluation data return [{ "split": problem.DatasetSplit.TRAIN, - "shards": 90, + "shards": 9, }, { "split": problem.DatasetSplit.EVAL, - "shards": 10, + "shards": 1, }] ``` diff --git a/setup.py b/setup.py index f02efdb2d..c30c752dd 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.5.2', + version='1.5.3', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index 944ef016a..a9ab7177f 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -65,7 +65,7 @@ flags.DEFINE_string("output_dir", "", "Base output directory for run.") flags.DEFINE_string("schedule", "continuous_train_and_eval", "Method of Experiment to run.") - flags.DEFINE_integer("eval_steps", 10000, + flags.DEFINE_integer("eval_steps", 100, "Number of steps in evaluation. By default, eval will " "stop after eval_steps or when it runs through the eval " "dataset once in full, whichever comes first, so this " diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 4e72bf505..a23747c01 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -165,6 +165,8 @@ def generate_files(generator, output_filenames, max_cases=None): for writer in writers: writer.close() + tf.logging.info("Generated %s Examples", counter) + def download_report_hook(count, block_size, total_size): """Report hook for download progress. 
@@ -198,19 +200,22 @@ def maybe_download(directory, filename, uri): """ if not tf.gfile.Exists(directory): tf.logging.info("Creating directory %s" % directory) - os.mkdir(directory) + tf.gfile.MakeDirs(directory) filepath = os.path.join(directory, filename) if not tf.gfile.Exists(filepath): tf.logging.info("Downloading %s to %s" % (uri, filepath)) try: tf.gfile.Copy(uri, filepath) except tf.errors.UnimplementedError: - inprogress_filepath = filepath + ".incomplete" - inprogress_filepath, _ = urllib.urlretrieve( - uri, inprogress_filepath, reporthook=download_report_hook) - # Print newline to clear the carriage return from the download progress - print() - tf.gfile.Rename(inprogress_filepath, filepath) + if uri.startswith("http"): + inprogress_filepath = filepath + ".incomplete" + inprogress_filepath, _ = urllib.urlretrieve( + uri, inprogress_filepath, reporthook=download_report_hook) + # Print newline to clear the carriage return from the download progress + print() + tf.gfile.Rename(inprogress_filepath, filepath) + else: + raise ValueError("Unrecognized URI: " + filepath) statinfo = os.stat(filepath) tf.logging.info("Successfully downloaded %s, %s bytes." % (filename, statinfo.st_size)) @@ -232,7 +237,7 @@ def maybe_download_from_drive(directory, filename, url): """ if not tf.gfile.Exists(directory): tf.logging.info("Creating directory %s" % directory) - os.mkdir(directory) + tf.gfile.MakeDirs(directory) filepath = os.path.join(directory, filename) confirm_token = None if tf.gfile.Exists(filepath): diff --git a/tensor2tensor/data_generators/ice_parsing.py b/tensor2tensor/data_generators/ice_parsing.py index eea2616b5..22f5d1282 100644 --- a/tensor2tensor/data_generators/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -32,14 +32,10 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder -from tensor2tensor.data_generators import translate +from tensor2tensor.data_generators import text_problems from tensor2tensor.utils import registry -# End-of-sentence marker. 
-EOS = text_encoder.EOS_ID - - def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, source_vocab_size, target_vocab_size): """Generate source and target data from a single file.""" @@ -51,8 +47,9 @@ def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, data_dir, tmp_dir, filename, 1, prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size) pair_filepath = os.path.join(tmp_dir, filename) - return translate.tabbed_generator(pair_filepath, source_vocab, target_vocab, - EOS) + return text_problems.text2text_generate_encoded( + text_problems.text2text_txt_tab_iterator(pair_filepath), source_vocab, + target_vocab) def tabbed_parsing_character_generator(tmp_dir, train): @@ -60,8 +57,8 @@ def tabbed_parsing_character_generator(tmp_dir, train): character_vocab = text_encoder.ByteTextEncoder() filename = "parsing_{0}.pairs".format("train" if train else "dev") pair_filepath = os.path.join(tmp_dir, filename) - return translate.tabbed_generator(pair_filepath, character_vocab, - character_vocab, EOS) + return text_problems.text2text_generate_encoded( + text_problems.text2text_txt_tab_iterator(pair_filepath), character_vocab) @registry.register_problem @@ -114,8 +111,9 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): def hparams(self, defaults, unused_model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, - source_vocab_size)} + p.input_modality = { + "inputs": (registry.Modalities.SYMBOL, source_vocab_size) + } p.target_modality = (registry.Modalities.SYMBOL, self.targeted_vocab_size) p.input_space_id = self.input_space_id p.target_space_id = self.target_space_id diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index fc4d0347e..4f14c1040 100644 --- a/tensor2tensor/data_generators/lm1b.py +++ b/tensor2tensor/data_generators/lm1b.py @@ -151,7 +151,7 @@ class LanguagemodelLm1b32k(text_problems.Text2TextProblem): """A language model on the 1B words corpus.""" @property - def vocab_name(self): + def vocab_filename(self): return "vocab.lm1b.en.%d" % self.approx_vocab_size @property diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index ebcc0697d..fa4fbea96 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -162,7 +162,7 @@ class Problem(object): data_dir. Vocab files are newline-separated files with each line containing a token. The standard convention for the filename is to set it to be - ${Problem.vocab_name}.${Problem.targeted_vocab_size} + ${Problem.vocab_filename}.${Problem.targeted_vocab_size} - Downloads and other files can be written to tmp_dir - If you have a training and dev generator, you can generate the training and dev datasets with @@ -721,6 +721,11 @@ def define_shapes(example): dataset = dataset.repeat() data_files = tf.contrib.slim.parallel_reader.get_data_files( self.filepattern(data_dir, mode)) + # In continuous_train_and_eval when switching between train and + # eval, this input_fn method gets called multiple times and it + # would give you the exact same samples from the last call + # (because the Graph seed is set). So this skip gives you some + # shuffling. 
dataset = skip_random_fraction(dataset, data_files[0]) dataset = dataset.map( diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 985a93b30..fa057ade9 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -282,11 +282,12 @@ def _init_vocab_from_file(self, filename): Args: filename: The file to load vocabulary from. """ + with tf.gfile.Open(filename) as f: + tokens = [token.strip() for token in f.readlines()] + def token_gen(): - with tf.gfile.Open(filename) as f: - for line in f: - token = line.strip() - yield token + for token in tokens: + yield token self._init_vocab(token_gen(), add_reserved_tokens=False) @@ -379,7 +380,7 @@ def match(m): try: return six.unichr(int(m.group(1))) except (ValueError, OverflowError) as _: - return u"\u3013" + return u"\u3013" # Unicode for undefined character. trimmed = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token return _UNESCAPE_REGEX.sub(match, trimmed) @@ -827,11 +828,9 @@ def _load_from_file_object(self, f): self._init_alphabet_from_tokens(subtoken_strings) def _load_from_file(self, filename): - """Load from a file. - - Args: - filename: Filename to load vocabulary from - """ + """Load from a vocab file.""" + if not tf.gfile.Exists(filename): + raise ValueError("File %s not found" % filename) with tf.gfile.Open(filename) as f: self._load_from_file_object(f) diff --git a/tensor2tensor/data_generators/text_problems.py b/tensor2tensor/data_generators/text_problems.py index 4184c974d..73e6bf4c7 100644 --- a/tensor2tensor/data_generators/text_problems.py +++ b/tensor2tensor/data_generators/text_problems.py @@ -222,15 +222,8 @@ def _maybe_pack_examples(self, generator): def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): generator = self.generate_samples(data_dir, tmp_dir, dataset_split) encoder = self.get_or_create_vocab(data_dir, tmp_dir) - for sample in generator: - targets = encoder.encode(sample["targets"]) - targets.append(text_encoder.EOS_ID) - encoded_sample = {"targets": targets} - if self.has_inputs: - inputs = encoder.encode(sample["inputs"]) - inputs.append(text_encoder.EOS_ID) - encoded_sample["inputs"] = inputs - yield encoded_sample + return text2text_generate_encoded(generator, encoder, + has_inputs=self.has_inputs) @property def batch_size_means_tokens(self): @@ -244,15 +237,15 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): problem.DatasetSplit.TEST: self.test_filepaths, } - split_paths = dict([(split["split"], filepath_fns[split["split"]]( + split_paths = [(split["split"], filepath_fns[split["split"]]( data_dir, split["shards"], shuffled=False)) - for split in self.dataset_splits]) + for split in self.dataset_splits] all_paths = [] - for paths in split_paths.values(): + for _, paths in split_paths: all_paths.extend(paths) if self.is_generate_per_split: - for split, paths in split_paths.items(): + for split, paths in split_paths: generator_utils.generate_files( self._maybe_pack_examples( self.generate_encoded_samples(data_dir, tmp_dir, split)), paths) @@ -418,8 +411,7 @@ def example_reading_spec(self): def txt_line_iterator(txt_path): """Iterate through lines of file.""" with tf.gfile.Open(txt_path) as f: - readline = lambda: f.readline() - for line in iter(readline, ""): + for line in f: yield line.strip() @@ -472,11 +464,26 @@ def text2text_txt_tab_iterator(txt_path): """ for line in txt_line_iterator(txt_path): if line and "\t" in line: - parts = line.split("\t") + 
parts = line.split("\t", 1) inputs, targets = parts[:2] yield {"inputs": inputs.strip(), "targets": targets.strip()} +def text2text_generate_encoded(sample_generator, + vocab, + targets_vocab=None, + has_inputs=True): + """Encode Text2Text samples from the generator with the vocab.""" + targets_vocab = targets_vocab or vocab + for sample in sample_generator: + if has_inputs: + sample["inputs"] = vocab.encode(sample["inputs"]) + sample["inputs"].append(text_encoder.EOS_ID) + sample["targets"] = targets_vocab.encode(sample["targets"]) + sample["targets"].append(text_encoder.EOS_ID) + yield sample + + @registry.register_problem class Text2textTmpdir(Text2TextProblem): """Allows training a Text2TextProblem without defining a subclass. diff --git a/tensor2tensor/data_generators/translate.py b/tensor2tensor/data_generators/translate.py index bf3196144..435d1dfe2 100644 --- a/tensor2tensor/data_generators/translate.py +++ b/tensor2tensor/data_generators/translate.py @@ -52,9 +52,8 @@ def vocab_data_files(self): return self.source_data_files(problem.DatasetSplit.TRAIN) def generate_samples(self, data_dir, tmp_dir, dataset_split): - train = dataset_split == problem.DatasetSplit.TRAIN datasets = self.source_data_files(dataset_split) - tag = "train" if train else "dev" + tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev" data_path = compile_data(tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag)) @@ -67,127 +66,6 @@ def generate_samples(self, data_dir, tmp_dir, dataset_split): data_path + ".lang2") -# Generic generators used later for multiple problems. - - -def character_generator(source_path, target_path, character_vocab, eos=None): - """Generator for sequence-to-sequence tasks that just uses characters. - - This generator assumes the files at source_path and target_path have - the same number of lines and yields dictionaries of "inputs" and "targets" - where inputs are characters from the source lines converted to integers, - and targets are characters from the target lines, also converted to integers. - - Args: - source_path: path to the file with source sentences. - target_path: path to the file with target sentences. - character_vocab: a TextEncoder to encode the characters. - eos: integer to append at the end of each sequence (default: None). - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from characters in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - with tf.gfile.GFile(target_path, mode="r") as target_file: - source, target = source_file.readline(), target_file.readline() - while source and target: - source_ints = character_vocab.encode(source.strip()) + eos_list - target_ints = character_vocab.encode(target.strip()) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - source, target = source_file.readline(), target_file.readline() - - -def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): - r"""Generator for sequence-to-sequence tasks using tabbed files. - - Tokens are derived from text files where each line contains both - a source and a target string. The two strings are separated by a tab - character ('\t'). It yields dictionaries of "inputs" and "targets" where - inputs are characters from the source lines converted to integers, and - targets are characters from the target lines, also converted to integers. 
- - Args: - source_path: path to the file with source and target sentences. - source_vocab: a SubwordTextEncoder to encode the source string. - target_vocab: a SubwordTextEncoder to encode the target string. - eos: integer to append at the end of each sequence (default: None). - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from characters in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - for line in source_file: - if line and "\t" in line: - parts = line.split("\t", 1) - source, target = parts[0].strip(), parts[1].strip() - source_ints = source_vocab.encode(source) + eos_list - target_ints = target_vocab.encode(target) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - - -def token_generator(source_path, target_path, token_vocab, eos=None): - """Generator for sequence-to-sequence tasks that uses tokens. - - This generator assumes the files at source_path and target_path have - the same number of lines and yields dictionaries of "inputs" and "targets" - where inputs are token ids from the " "-split source (and target, resp.) lines - converted to integers using the token_map. - - Args: - source_path: path to the file with source sentences. - target_path: path to the file with target sentences. - token_vocab: text_encoder.TextEncoder object. - eos: integer to append at the end of each sequence (default: None). - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from tokens in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - with tf.gfile.GFile(target_path, mode="r") as target_file: - source, target = source_file.readline(), target_file.readline() - while source and target: - source_ints = token_vocab.encode(source.strip()) + eos_list - target_ints = token_vocab.encode(target.strip()) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - source, target = source_file.readline(), target_file.readline() - - -def bi_vocabs_token_generator(source_path, - target_path, - source_token_vocab, - target_token_vocab, - eos=None): - """Generator for sequence-to-sequence tasks that uses tokens. - - This generator assumes the files at source_path and target_path have - the same number of lines and yields dictionaries of "inputs" and "targets" - where inputs are token ids from the " "-split source (and target, resp.) lines - converted to integers using the token_map. - - Args: - source_path: path to the file with source sentences. - target_path: path to the file with target sentences. - source_token_vocab: text_encoder.TextEncoder object. - target_token_vocab: text_encoder.TextEncoder object. - eos: integer to append at the end of each sequence (default: None). - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from tokens in the file lines. 
- """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - with tf.gfile.GFile(target_path, mode="r") as target_file: - source, target = source_file.readline(), target_file.readline() - while source and target: - source_ints = source_token_vocab.encode(source.strip()) + eos_list - target_ints = target_token_vocab.encode(target.strip()) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - source, target = source_file.readline(), target_file.readline() - - def _preprocess_sgm(line, is_sgm): """Preprocessing to strip tags in SGM files.""" if not is_sgm: @@ -209,14 +87,19 @@ def _preprocess_sgm(line, is_sgm): def compile_data(tmp_dir, datasets, filename): """Concatenate all `datasets` and save to `filename`.""" filename = os.path.join(tmp_dir, filename) - with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile: - with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile: + lang1_fname = filename + ".lang1" + lang2_fname = filename + ".lang2" + if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname): + tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname, + lang2_fname) + with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile: + with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile: for dataset in datasets: url = dataset[0] compressed_filename = os.path.basename(url) compressed_filepath = os.path.join(tmp_dir, compressed_filename) - - generator_utils.maybe_download(tmp_dir, compressed_filename, url) + if url.startswith("http"): + generator_utils.maybe_download(tmp_dir, compressed_filename, url) if dataset[1][0] == "tsv": _, src_column, trg_column, glob_pattern = dataset[1] @@ -232,13 +115,17 @@ def compile_data(tmp_dir, datasets, filename): new_filename = tsv_filename.strip(".gz") generator_utils.gunzip_file(tsv_filename, new_filename) tsv_filename = new_filename - with tf.gfile.GFile(tsv_filename, mode="r") as tsv_file: + with tf.gfile.Open(tsv_filename) as tsv_file: for line in tsv_file: if line and "\t" in line: parts = line.split("\t") source, target = parts[src_column], parts[trg_column] - lang1_resfile.write(source.strip() + "\n") - lang2_resfile.write(target.strip() + "\n") + source, target = source.strip(), target.strip() + if source and target: + lang1_resfile.write(source) + lang1_resfile.write("\n") + lang2_resfile.write(target) + lang2_resfile.write("\n") else: lang1_filename, lang2_filename = dataset[1] lang1_filepath = os.path.join(tmp_dir, lang1_filename) @@ -246,8 +133,8 @@ def compile_data(tmp_dir, datasets, filename): is_sgm = ( lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm")) - if not (os.path.exists(lang1_filepath) and - os.path.exists(lang2_filepath)): + if not (tf.gfile.Exists(lang1_filepath) and + tf.gfile.Exists(lang2_filepath)): # For .tar.gz and .tgz files, we read compressed. 
mode = "r:gz" if compressed_filepath.endswith("gz") else "r" with tarfile.open(compressed_filepath, mode) as corpus_tar: @@ -260,15 +147,15 @@ def compile_data(tmp_dir, datasets, filename): new_filepath = lang2_filepath.strip(".gz") generator_utils.gunzip_file(lang2_filepath, new_filepath) lang2_filepath = new_filepath - with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file: - with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file: - line1, line2 = lang1_file.readline(), lang2_file.readline() - while line1 or line2: - line1res = _preprocess_sgm(line1, is_sgm) - line2res = _preprocess_sgm(line2, is_sgm) - if line1res or line2res: - lang1_resfile.write(line1res.strip() + "\n") - lang2_resfile.write(line2res.strip() + "\n") - line1, line2 = lang1_file.readline(), lang2_file.readline() + + for example in text_problems.text2text_txt_iterator( + lang1_filepath, lang2_filepath): + line1res = _preprocess_sgm(example["inputs"], is_sgm) + line2res = _preprocess_sgm(example["targets"], is_sgm) + if line1res and line2res: + lang1_resfile.write(line1res) + lang1_resfile.write("\n") + lang2_resfile.write(line2res) + lang2_resfile.write("\n") return filename diff --git a/tensor2tensor/data_generators/translate_ende.py b/tensor2tensor/data_generators/translate_ende.py index b493ec5c9..2a1e52c2f 100644 --- a/tensor2tensor/data_generators/translate_ende.py +++ b/tensor2tensor/data_generators/translate_ende.py @@ -142,6 +142,14 @@ def packed_length(self): return 256 +@registry.register_problem +class TranslateEndeWmt8kPacked(TranslateEndeWmt8k): + + @property + def packed_length(self): + return 256 + + @registry.register_problem class TranslateEndeWmtCharacters(translate.TranslateProblem): """Problem spec for WMT En-De translation.""" diff --git a/tensor2tensor/data_generators/translate_enzh.py b/tensor2tensor/data_generators/translate_enzh.py index 01f9d8fc1..444fc9834 100644 --- a/tensor2tensor/data_generators/translate_enzh.py +++ b/tensor2tensor/data_generators/translate_enzh.py @@ -26,6 +26,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.data_generators import translate from tensor2tensor.utils import registry @@ -47,9 +48,11 @@ # This dataset is only a small fraction of full WMT17 task _NC_TRAIN_DATASETS = [[ "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12" - ".tgz", - ["training/news-commentary-v12.zh-en.en", - "training/news-commentary-v12.zh-en.zh"]]] + ".tgz", [ + "training/news-commentary-v12.zh-en.en", + "training/news-commentary-v12.zh-en.zh" + ] +]] # Test set from News Commentary. 2000 lines _NC_TEST_DATASETS = [[ @@ -65,8 +68,8 @@ # place into tmp directory e.g. /tmp/t2t_datagen/dataset.tgz _UN_TRAIN_DATASETS = [[ "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/UNv1.0.en-zh.tar" - ".gz", - ["en-zh/UNv1.0.en-zh.en", "en-zh/UNv1.0.en-zh.zh"]]] + ".gz", ["en-zh/UNv1.0.en-zh.en", "en-zh/UNv1.0.en-zh.zh"] +]] # CWMT corpus # Visit source website to download manually: @@ -81,57 +84,79 @@ # NOTE: You need to register to download dataset from official source # place into tmp directory e.g. 
/tmp/t2t_datagen/dataset.tgz -_CWMT_TRAIN_DATASETS = [ - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/casia2015/casia2015_en.txt", "cwmt/casia2015/casia2015_ch.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/casict2015/casict2015_en.txt", - "cwmt/casict2015/casict2015_ch.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/neu2017/NEU_en.txt", "cwmt/neu2017/NEU_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2015/datum_en.txt", "cwmt/datum2015/datum_ch.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book1_en.txt", "cwmt/datum2017/Book1_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book2_en.txt", "cwmt/datum2017/Book2_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book3_en.txt", "cwmt/datum2017/Book3_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book4_en.txt", "cwmt/datum2017/Book4_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book5_en.txt", "cwmt/datum2017/Book5_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book6_en.txt", "cwmt/datum2017/Book6_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book7_en.txt", "cwmt/datum2017/Book7_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book8_en.txt", "cwmt/datum2017/Book8_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book9_en.txt", "cwmt/datum2017/Book9_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book10_en.txt", "cwmt/datum2017/Book10_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book11_en.txt", "cwmt/datum2017/Book11_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book12_en.txt", "cwmt/datum2017/Book12_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book13_en.txt", "cwmt/datum2017/Book13_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book14_en.txt", "cwmt/datum2017/Book14_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book15_en.txt", "cwmt/datum2017/Book15_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book16_en.txt", "cwmt/datum2017/Book16_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book17_en.txt", "cwmt/datum2017/Book17_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book18_en.txt", "cwmt/datum2017/Book18_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book19_en.txt", "cwmt/datum2017/Book19_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book20_en.txt", "cwmt/datum2017/Book20_cn.txt"]] -] +_CWMT_TRAIN_DATASETS = [[ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + 
["cwmt/casia2015/casia2015_en.txt", "cwmt/casia2015/casia2015_ch.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/casict2015/casict2015_en.txt", "cwmt/casict2015/casict2015_ch.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/neu2017/NEU_en.txt", "cwmt/neu2017/NEU_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2015/datum_en.txt", "cwmt/datum2015/datum_ch.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book1_en.txt", "cwmt/datum2017/Book1_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book2_en.txt", "cwmt/datum2017/Book2_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book3_en.txt", "cwmt/datum2017/Book3_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book4_en.txt", "cwmt/datum2017/Book4_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book5_en.txt", "cwmt/datum2017/Book5_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book6_en.txt", "cwmt/datum2017/Book6_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book7_en.txt", "cwmt/datum2017/Book7_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book8_en.txt", "cwmt/datum2017/Book8_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book9_en.txt", "cwmt/datum2017/Book9_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book10_en.txt", "cwmt/datum2017/Book10_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book11_en.txt", "cwmt/datum2017/Book11_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book12_en.txt", "cwmt/datum2017/Book12_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book13_en.txt", "cwmt/datum2017/Book13_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book14_en.txt", "cwmt/datum2017/Book14_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book15_en.txt", "cwmt/datum2017/Book15_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book16_en.txt", "cwmt/datum2017/Book16_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book17_en.txt", "cwmt/datum2017/Book17_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book18_en.txt", "cwmt/datum2017/Book18_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book19_en.txt", "cwmt/datum2017/Book19_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book20_en.txt", "cwmt/datum2017/Book20_cn.txt"] +]] def get_filename(dataset): @@ -215,9 +240,10 @@ def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): tag = "train" if train else "dev" filename_base = 
"wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag) data_path = translate.compile_data(tmp_dir, datasets, filename_base) - return translate.bi_vocabs_token_generator(data_path + ".lang1", - data_path + ".lang2", - source_vocab, target_vocab, EOS) + return text_problems.text2text_generate_encoded( + text_problems.text2text_txt_iterator(data_path + ".lang1", + data_path + ".lang2"), + source_vocab, target_vocab) def feature_encoders(self, data_dir): source_vocab_filename = os.path.join(data_dir, self.source_vocab_name) diff --git a/tensor2tensor/data_generators/translate_test.py b/tensor2tensor/data_generators/translate_test.py index f28b47818..201898352 100644 --- a/tensor2tensor/data_generators/translate_test.py +++ b/tensor2tensor/data_generators/translate_test.py @@ -19,64 +19,71 @@ from __future__ import division from __future__ import print_function -import io import os -import tempfile +import shutil +import tarfile # Dependency imports -import six -from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.data_generators import translate import tensorflow as tf class TranslateTest(tf.test.TestCase): + DATASETS = [ + ["data1.tgz", ("train1.en", "train1.de")], + ["data2.tgz", ("train2.en", "train2.de")], + ["data3.tgz", ("train3.en", "train3.de")], + ] - def testCharacterGenerator(self): - # Generate a trivial source and target file. - tmp_dir = self.get_temp_dir() - (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) - if six.PY2: - enc_f = lambda s: s - else: - enc_f = lambda s: s.encode("utf-8") - with io.open(tmp_file_path + ".src", "wb") as src_file: - src_file.write(enc_f("source1\n")) - src_file.write(enc_f("source2\n")) - with io.open(tmp_file_path + ".tgt", "wb") as tgt_file: - tgt_file.write(enc_f("target1\n")) - tgt_file.write(enc_f("target2\n")) - - # Call character generator on the generated files. - results_src, results_tgt = [], [] - character_vocab = text_encoder.ByteTextEncoder() - for dictionary in translate.character_generator( - tmp_file_path + ".src", tmp_file_path + ".tgt", character_vocab): - self.assertEqual(sorted(list(dictionary)), ["inputs", "targets"]) - results_src.append(dictionary["inputs"]) - results_tgt.append(dictionary["targets"]) - - # Check that the results match the files. - # First check that the results match the encoded original strings; - # this is a comparison of integer arrays. - self.assertEqual(len(results_src), 2) - self.assertEqual(results_src[0], character_vocab.encode("source1")) - self.assertEqual(results_src[1], character_vocab.encode("source2")) - self.assertEqual(results_tgt[0], character_vocab.encode("target1")) - self.assertEqual(results_tgt[1], character_vocab.encode("target2")) - # Then decode the results and compare with the original strings; - # this is a comparison of strings - self.assertEqual(character_vocab.decode(results_src[0]), "source1") - self.assertEqual(character_vocab.decode(results_src[1]), "source2") - self.assertEqual(character_vocab.decode(results_tgt[0]), "target1") - self.assertEqual(character_vocab.decode(results_tgt[1]), "target2") - - # Clean up. 
- os.remove(tmp_file_path + ".src") - os.remove(tmp_file_path + ".tgt") - os.remove(tmp_file_path) + @classmethod + def setUpClass(cls): + tmp_dir = tf.test.get_temp_dir() + compressed_dir = os.path.join(tmp_dir, "compressed") + shutil.rmtree(tmp_dir) + tf.gfile.MakeDirs(compressed_dir) + + en_data = [str(i) for i in range(10, 40)] + de_data = [str(i) for i in range(100, 130)] + data = list(zip(en_data, de_data)) + + for i, dataset in enumerate(cls.DATASETS): + tar_file = dataset[0] + en_file, de_file = [ + os.path.join(compressed_dir, name) for name in dataset[1] + ] + with tf.gfile.Open(en_file, "w") as en_f: + with tf.gfile.Open(de_file, "w") as de_f: + start = i * 10 + end = start + 10 + for en_line, de_line in data[start:end]: + en_f.write(en_line) + en_f.write("\n") + de_f.write(de_line) + de_f.write("\n") + + with tarfile.open(os.path.join(tmp_dir, tar_file), "w:gz") as tar_f: + tar_f.add(en_file, os.path.basename(en_file)) + tar_f.add(de_file, os.path.basename(de_file)) + + cls.tmp_dir = tmp_dir + cls.data = data + + def testCompileData(self): + filename = "out" + filepath = os.path.join(self.tmp_dir, filename) + translate.compile_data(self.tmp_dir, self.DATASETS, filename) + + count = 0 + for i, example in enumerate( + text_problems.text2text_txt_iterator(filepath + ".lang1", + filepath + ".lang2")): + expected = self.data[i] + self.assertEqual(list(expected), [example["inputs"], example["targets"]]) + count += 1 + self.assertEqual(count, len(self.data)) if __name__ == "__main__": diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 33a77b746..9909a1267 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -223,7 +223,7 @@ class LanguagemodelWikiNorefV8kL1k(LanguagemodelWikiXmlV8kL1k): """ @property - def vocab_name(self): + def vocab_filename(self): return "vocab.wiki_noref" def filepath_to_unicode_text(self, filepath): diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index d567016a5..a9346c34d 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -1510,7 +1510,6 @@ def masked_local_attention_1d(q, k, v, block_length=128, """ with tf.variable_scope( name, default_name="local_attention_1d", values=[q, k, v]): - v_shape = v.get_shape() batch = common_layers.shape_list(q)[0] heads = common_layers.shape_list(q)[1] length = common_layers.shape_list(q)[2] @@ -1534,7 +1533,11 @@ def masked_local_attention_1d(q, k, v, block_length=128, q = tf.pad(q, padding) k = tf.pad(k, padding) v = tf.pad(v, padding) - num_blocks = tf.div(length, block_length) + + if isinstance(length, int) and isinstance(block_length, int): + num_blocks = length // block_length + else: + num_blocks = tf.div(length, block_length) # compute attention for the first query block. 
first_q = tf.slice(q, [0, 0, 0, 0], [-1, -1, block_length, -1]) @@ -1553,17 +1556,21 @@ def masked_local_attention_1d(q, k, v, block_length=128, k = tf.reshape(k, [batch, heads, num_blocks, block_length, depth_k]) v = tf.reshape(v, [batch, heads, num_blocks, block_length, depth_v]) - def local(x): + def local(x, depth): """Create a local version of the keys or values.""" prev_block = tf.slice(x, [0, 0, 0, 0, 0], [-1, -1, num_blocks - 1, -1, -1]) cur_block = tf.slice(x, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) - return tf.concat([prev_block, cur_block], 3) + local_block = tf.concat([prev_block, cur_block], 3) + return tf.reshape(local_block, + [batch, heads, num_blocks - 1, + block_length * 2, depth]) - local_k = local(k) - local_v = local(v) + local_k = local(k, depth_k) + local_v = local(v, depth_v) tail_q = tf.slice(q, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) - + tail_q = tf.reshape(tail_q, [batch, heads, num_blocks - 1, + block_length, depth_k]) local_length = common_layers.shape_list(local_k)[3] # [batch, heads, num_blocks - 1, block_length, local_length] @@ -1579,10 +1586,11 @@ def local(x): # The naive way currently causes errors due to empty tensors. # output: [batch, heads, num_blocks-1, block_length, depth_v] output = tf.matmul(attention, local_v) - output = tf.reshape(output, [batch, heads, -1, depth_v]) + output = tf.reshape(output, [ + batch, heads, (num_blocks-1)*block_length, depth_v]) output = tf.concat([first_output, output], axis=2) output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) - output.set_shape(v_shape) + output = tf.reshape(output, [batch, heads, original_length, depth_v]) return output diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 159006b9b..c4c1cf885 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -61,13 +61,25 @@ def basic_params1(): optimizer_adam_beta2=0.997, optimizer_momentum_momentum=0.9, optimizer_momentum_nesterov=False, + optimizer_adafactor_beta1=0.0, + optimizer_adafactor_beta2=0.999, + optimizer_adafactor_factored=True, + optimizer_adafactor_decay_type="pow", + optimizer_adafactor_memory_exponent=0.8, + optimizer_adafactor_clipping_threshold=1.0, + optimizer_adafactor_multiply_by_parameter_scale=True, weight_decay=1e-6, weight_noise=0.0, - learning_rate_schedule="warmup_and_decay", - # If learning_rate_schedule=="warmup_and_decay", then this specifies - # the decay part of the schedule. - # The warmup is always exponential. - # TODO(noam): add a hyperparameter to control the warmup. + # Defines the learning rate as a product of named functions. + # Available functions are listed in learning_rate._LEARNING_RATE_FUNCTIONS + # e.g. "constant*linear_warmup*rsqrt_decay*rsqrt_hidden_size" + learning_rate_schedule="legacy", + learning_rate_constant=1.0, + # If learning_rate_schedule=="legacy", + # then we specify decay scheme here. Warmup is always exponential, + # except with "noam" learning rate decay scheme. + # see optimize.legacy_learning_rate_schedule() + # TODO(noam): migrate everyone away from this. 
learning_rate_decay_scheme="none", # decay_steps and decay_staircase for learning_rate_decay_scheme=="exp" learning_rate_decay_steps=5000, diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py index f830ac977..0e7ac4a4e 100644 --- a/tensor2tensor/layers/common_image_attention.py +++ b/tensor2tensor/layers/common_image_attention.py @@ -45,14 +45,13 @@ def get_choices(): ] -def maybe_reshape_4d_to_3d(x, hparams): +def maybe_reshape_4d_to_3d(x): """Reshape input from 4D to 3D if necessary.""" x_shape = common_layers.shape_list(x) is_4d = False if len(x_shape) == 4: x = tf.reshape(x, [x_shape[0], x_shape[1]*x_shape[2], x_shape[3]]) is_4d = True - x.set_shape([None, None, hparams.hidden_size]) return x, x_shape, is_4d @@ -82,7 +81,7 @@ def local_attention_1d(x, kv_padding="VALID"): """Local 1d self attention.""" # self-attention - x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x) with tf.variable_scope("local_1d_self_att"): y = common_attention.multihead_attention( x, @@ -104,7 +103,6 @@ def local_attention_1d(x, name="self_attention") if is_4d: y = tf.reshape(y, x_shape) - y.set_shape([None, None, None, hparams.hidden_size]) return y @@ -117,7 +115,7 @@ def dilated_attention_1d(x, gap_size=2): """Dilated 1d self attention.""" # self-attention - x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x) with tf.variable_scope("masked_dilated_1d"): y = common_attention.multihead_attention( x, @@ -195,7 +193,7 @@ def full_self_attention(x, q_padding="LEFT", kv_padding="LEFT"): """Full self-attention layer.""" - x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x) with tf.variable_scope("self_att"): y = common_attention.multihead_attention( x, @@ -221,8 +219,8 @@ def encdec_attention_1d(x, encoder_output, hparams): """Local 1d self attention.""" - x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) - encoder_output, _, _ = maybe_reshape_4d_to_3d(encoder_output, hparams) + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x) + encoder_output, _, _ = maybe_reshape_4d_to_3d(encoder_output) with tf.variable_scope("encdec_attention"): # Encoder Decoder attention y = common_attention.multihead_attention( @@ -518,11 +516,12 @@ def prepare_decoder(targets, hparams): x = add_pos_signals(x, hparams, "dec_pos") else: # Add position signals - x = tf.reshape(x, [-1, x_shape[1]*x_shape[2], hparams.hidden_size]) + x = tf.reshape(x, [targets_shape[0], + x_shape[1]*x_shape[2], hparams.hidden_size]) x = common_layers.shift_right_3d(x) - x = tf.reshape(x, [-1, x_shape[1], x_shape[2], hparams.hidden_size]) + x = tf.reshape(x, [targets_shape[0], + x_shape[1], x_shape[2], hparams.hidden_size]) x = add_pos_signals(x, hparams, "dec_pos") - x.set_shape([None, None, None, hparams.hidden_size]) return x, x_shape[1], x_shape[2], bias diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 1bfc97248..87d60e911 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -216,11 +216,12 @@ def dropout_no_scaling(x, keep_prob): def embedding(x, vocab_size, dense_size, name=None, reuse=None, multiplier=1.0, - symbol_dropout_rate=0.0): + symbol_dropout_rate=0.0, embedding_var=None): """Embed x of type int64 into dense vectors, reducing to max 4 dimensions.""" with tf.variable_scope( name, default_name="embedding", values=[x], reuse=reuse): - embedding_var = 
tf.get_variable("kernel", [vocab_size, dense_size]) + if embedding_var is None: + embedding_var = tf.get_variable("kernel", [vocab_size, dense_size]) # On the backwards pass, we want to convert the gradient from # an indexed-slices to a regular tensor before sending it back to the # parameter server. This avoids excess computation on the parameter server. @@ -2388,7 +2389,7 @@ def ones_matrix_band_part(rows, cols, num_lower, num_upper, out_shape=None): num_lower = rows - 1 if num_upper < 0: num_upper = cols - 1 - lower_mask = np.tri(rows, cols, num_lower).T + lower_mask = np.tri(cols, rows, num_lower).T upper_mask = np.tri(rows, cols, num_upper) band = np.ones((rows, cols)) * lower_mask * upper_mask if out_shape: diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py new file mode 100644 index 000000000..ccb00ab6b --- /dev/null +++ b/tensor2tensor/layers/discretization.py @@ -0,0 +1,589 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Discretization bottlenecks used to train discrete latent variables. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from functools import partial +# Dependency imports +from tensor2tensor.layers import common_layers +import tensorflow as tf +from tensorflow.python.training import moving_averages + + +def project_hidden(x, projection_tensors, hidden_size, num_blocks): + """Project encoder hidden state into block_dim using projection tensors. + + Args: + x: Encoder hidden state of shape [-1, hidden_size]. + projection_tensors: Projection tensors used to project the hidden state. + hidden_size: Dimension of the latent space. + num_blocks: Number of blocks in DVQ. + + Returns: + Projected states of shape [-1, num_blocks, block_dim]. + """ + x = tf.reshape(x, shape=[1, -1, hidden_size]) + x_tiled = tf.reshape( + tf.tile(x, multiples=[num_blocks, 1, 1]), + shape=[num_blocks, -1, hidden_size]) + x_projected = tf.matmul(x_tiled, projection_tensors) + x_projected = tf.transpose(x_projected, perm=[1, 0, 2]) + return x_projected + + +def slice_hidden(x, hidden_size, num_blocks): + """Slice encoder hidden state into block_dim. + + Args: + x: Encoder hidden state of shape [-1, hidden_size]. + hidden_size: Dimension of the latent space. + num_blocks: Number of blocks in DVQ. + + Returns: + Sliced states of shape [-1, num_blocks, block_dim]. + """ + block_dim = int(hidden_size // num_blocks) + x_sliced = tf.reshape(x, shape=[-1, num_blocks, block_dim]) + return x_sliced + + +def nearest_neighbor(x, means, block_v_size, random_top_k=1): + """Find the nearest element in means to elements in x. + + Args: + x: Batch of encoder continuous latent states sliced/projected into shape + [-1, num_blocks, block_dim]. + means: Embedding table of shpae [num_blocks, block_v_size, block_dim]. + block_v_size: Number of table entries per block. + random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). 
+ + Returns: + Tensor with nearest element in mean encoded in one-hot notation. + """ + x_norm_sq = tf.reduce_sum(tf.square(x), axis=-1, keep_dims=True) + means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keep_dims=True) + scalar_prod = tf.matmul( + tf.transpose(x, perm=[1, 0, 2]), tf.transpose(means, perm=[0, 2, 1])) + scalar_prod = tf.transpose(scalar_prod, perm=[1, 0, 2]) + dist = x_norm_sq + tf.transpose( + means_norm_sq, perm=[2, 0, 1]) - 2 * scalar_prod + if random_top_k > 1: + _, top_k_idx = tf.nn.top_k(-dist, k=random_top_k) + nearest_idx = tf.gather( + top_k_idx, + tf.random_uniform( + [1], minval=0, maxval=random_top_k - 1, dtype=tf.int32), + axis=-1) + else: + nearest_idx = tf.argmax(-dist, axis=-1) + nearest_hot = tf.one_hot(nearest_idx, block_v_size) + return tf.stop_gradient(nearest_hot) + + +def embedding_lookup(x, means, num_blocks, block_v_size, random_top_k=1): + """Compute nearest neighbors and loss for training the embeddings via DVQ. + + Args: + x: Batch of encoder continuous latent states sliced/projected into shape + [-1, num_blocks, block_dim]. + means: Embedding table of shape [num_blocks, block_v_size, block_dim]. + num_blocks: Number of blocks in DVQ. + block_v_size: Number of table entries per block. + random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). + + Returns: + The nearest neighbor in one hot form, the nearest neighbor itself, the + commitment loss, embedding training loss. + """ + x_means_hot = nearest_neighbor(x, means, block_v_size, random_top_k) + x_means_hot_flat = tf.reshape(x_means_hot, [-1, num_blocks, block_v_size]) + x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means) + x_means = tf.transpose(x_means, [1, 0, 2]) + q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x) - x_means))) + e_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means))) + return x_means_hot, x_means, q_loss, e_loss + + +def bit_to_int(x_bit, num_bits, base=2): + """Turn x_bit representing numbers bitwise (lower-endian) to int tensor. + + Args: + x_bit: Tensor containing numbers in a particular base to be converted to + int. + num_bits: Number of bits in the representation. + base: Base of the representation. + + Returns: + Integer representation of this number. + """ + x_l = tf.stop_gradient(tf.to_int32(tf.reshape(x_bit, [-1, num_bits]))) + x_labels = [] + for i in range(num_bits): + x_labels.append(x_l[:, i] * tf.to_int32(base)**tf.to_int32(i)) + res = sum(x_labels) + return tf.to_int32(tf.reshape(res, common_layers.shape_list(x_bit)[:-1])) + + +def int_to_bit(x_int, num_bits, base=2): + """Turn x_int representing numbers into a bitwise (lower-endian) tensor. + + Args: + x_int: Tensor containing integer to be converted into base notation. + num_bits: Number of bits in the representation. + base: Base of the representation. + + Returns: + Corresponding number expressed in base. + """ + x_l = tf.to_int32(tf.expand_dims(x_int, axis=-1)) + x_labels = [] + for i in range(num_bits): + x_labels.append( + tf.floormod( + tf.floordiv(tf.to_int32(x_l), + tf.to_int32(base)**i), tf.to_int32(base))) + res = tf.concat(x_labels, axis=-1) + return tf.to_float(res) + + +def embed(x, + hidden_size, + z_size, + filter_size, + name, + bottleneck_kind='dvq', + num_blocks=2, + block_v_size=None, + means=None): + """Embedding function that takes discrete latent and returns embedding. + + Args: + x: Input to the discretization bottleneck. + hidden_size: Dimension of the latent state. 
+ z_size: Number of bits used to produce discrete code; discrete codes range + from 1 to 2**z_size. + filter_size: Filter size to be used for the embedding function. + name: Name for the bottleneck scope. + bottleneck_kind: Kind of discretization bottleneck to use; one of dvq, + semhash, gumbel-softmax. + num_blocks: Number of blocks in DVQ. + block_v_size: Number of embedding entries per block. + means: The embedding table for dvq (Default: None). + + Returns: + Continuous embedding to be passed on to the decoder. + + Raises: + ValueError: For unknown or missing arguments. + """ + with tf.variable_scope(name, reuse=tf.AUTO_REUSE): + if bottleneck_kind == 'semhash': + c = int_to_bit(x, z_size) + h1a = tf.layers.dense(c, filter_size, name='vch1a') + h1b = tf.layers.dense(1.0 - c, filter_size, name='vch1b') + h1 = h1a + h1b + elif bottleneck_kind == 'gumbel-softmax': + hot = tf.one_hot(x, 2**z_size) + h1 = tf.layers.dense(hot, hidden_size, name='dae_dense') + elif bottleneck_kind == 'dvq': + if block_v_size is None: + raise ValueError('Bottleneck kind is dvq but block_v_size is None.') + + shape_x = common_layers.shape_list(x) + x_flat = tf.reshape(x, [-1, 1]) + c = int_to_bit(x_flat, num_bits=z_size, base=2) + shape = common_layers.shape_list(c) + new_shape = shape + new_shape[-1] = num_blocks + new_shape.append(int(z_size / num_blocks)) + c = tf.to_int32(tf.reshape(c, shape=new_shape)) + c = bit_to_int(c, num_bits=int(z_size / num_blocks), base=2) + c_hot = tf.one_hot(c, depth=block_v_size, axis=-1) + c_hot_flat = tf.reshape(c_hot, shape=[-1, num_blocks, block_v_size]) + h1 = tf.matmul(tf.transpose(c_hot_flat, perm=[1, 0, 2]), means) + h1 = tf.transpose(h1, perm=[1, 0, 2]) + new_shape = shape_x + new_shape.append(hidden_size) + h1 = tf.reshape(h1, new_shape) + elif bottleneck_kind == 'rounding': + h1 = x + else: + raise ValueError('Unknown bottleneck kind.') + + h2 = tf.layers.dense(tf.nn.relu(h1), filter_size, name='vch2') + return tf.layers.dense(tf.nn.relu(h2), hidden_size, name='vcfin') + + +def vae(x, name, z_size): + """Simple variational autoencoder without discretization. + + Args: + x: Input to the discretization bottleneck. + name: Name for the bottleneck scope. + z_size: Number of bits used to produce discrete code; discrete codes range + from 1 to 2**z_size. + + Returns: + Embedding function, latent, loss, mu and log_simga. + """ + with tf.variable_scope(name): + mu = tf.layers.dense(x, z_size, name='mu') + log_sigma = tf.layers.dense(x, z_size, name='log_sigma') + shape = common_layers.shape_list(x) + epsilon = tf.random_normal([shape[0], shape[1], 1, z_size]) + z = mu + tf.exp(log_sigma / 2) * epsilon + kl = 0.5 * tf.reduce_mean( + tf.exp(log_sigma) + tf.square(mu) - 1. - log_sigma, axis=-1) + free_bits = z_size // 4 + kl_loss = tf.reduce_mean(tf.maximum(kl - free_bits, 0.0)) + return z, kl_loss, mu, log_sigma + + +def top_k_softmax(x, k): + """Calculate softmax(x), select top-k and rescale to sum to 1. + + Args: + x: Input to softmax over. + k: Number of top-k to select. + + Returns: + softmax(x) and maximum item. + """ + x = tf.nn.softmax(x) + top_x, _ = tf.nn.top_k(x, k=k + 1) + min_top = tf.reduce_min(top_x, axis=-1, keep_dims=True) + x = tf.nn.relu((x - min_top) + 1e-12) + x /= tf.reduce_sum(x, axis=-1, keep_dims=True) + return x, tf.reduce_max(top_x, axis=-1) + + +def gumbel_sample(shape): + """Sample from the Gumbel distribution, protect from overflows. + + Args: + shape: Shape of Gumbel samples. + + Returns: + Noise drawn from Gumbel distribution. 
+ """ + uniform_samples = tf.random_uniform(shape, minval=0.00001, maxval=0.99998) + return -tf.log(-tf.log(uniform_samples)) + + +def gumbel_softmax(x, + name, + z_size, + mode, + softmax_k=0, + kl_warmup_steps=150000, + summary=True): + """Gumbel softmax discretization bottleneck. + + Args: + x: Input to the discretization bottleneck. + name: Name for the bottleneck scope. + z_size: Number of bits used to produce discrete code; discrete codes range + from 1 to 2**z_size. + mode: Mode represents whether we are training or testing for bottlenecks + that differ in behavior (Default: None). + softmax_k: If > 1 then do top-k softmax (Default: 0). + kl_warmup_steps: Number of steps for kl warmup (Default: 150000). + summary: If True, then write summaries (Default: True). + + Returns: + Embedding function, discrete code and loss. + """ + with tf.variable_scope(name): + m = tf.layers.dense(x, 2**z_size, name='mask') + if softmax_k > 0: + m, kl = top_k_softmax(m, softmax_k) + return m, m, 1.0 - tf.reduce_mean(kl) + logsm = tf.nn.log_softmax(m) + + # Gumbel-softmax sample. + gumbel_samples = gumbel_sample(common_layers.shape_list(m)) + steps = kl_warmup_steps + gumbel_samples *= common_layers.inverse_exp_decay(steps // 5) * 0.5 + temperature = 1.2 - common_layers.inverse_lin_decay(steps) + + # 10% of the time keep reasonably high temperature to keep learning. + temperature = tf.cond( + tf.less(tf.random_uniform([]), 0.9), lambda: temperature, + lambda: tf.random_uniform([], minval=0.5, maxval=1.0)) + s = tf.nn.softmax((logsm + gumbel_samples) / temperature) + m = tf.nn.softmax(m) + kl = -tf.reduce_max(logsm, axis=-1) + + if summary: + tf.summary.histogram('max-log', tf.reshape(kl, [-1])) + + # Calculate the argmax and construct hot vectors. + maxvec = tf.reshape(tf.argmax(m, axis=-1), [-1]) + maxvhot = tf.stop_gradient(tf.one_hot(maxvec, 2**z_size)) + + # Add losses that prevent too few being used. + distrib = tf.reshape(logsm, [-1, 2**z_size]) * maxvhot + d_mean = tf.reduce_mean(distrib, axis=[0], keep_dims=True) + d_variance = tf.reduce_mean(tf.square(distrib - d_mean), axis=[0]) + d_dev = -tf.reduce_mean(d_variance) + ret = s + + if mode != tf.contrib.learn.ModeKeys.TRAIN: + ret = tf.reshape(maxvhot, common_layers.shape_list(s)) # Just hot @eval. + return m, ret, d_dev * 5.0 + tf.reduce_mean(kl) * 0.002 + + +def discrete_bottleneck(x, + hidden_size, + z_size, + filter_size, + name, + mode=None, + startup_steps=50000, + bottleneck_kind='dvq', + num_blocks=2, + reshape_method='slice', + projection_tensors=None, + means=None, + beta=0.25, + noise_dev=1., + decay=0.999, + discrete_mix=0.5, + random_top_k=1, + epsilon=1e-5, + softmax_k=0, + kl_warmup_steps=150000, + ema=True, + ema_count=None, + ema_means=None, + summary=True, + dp_strength=1.0, + dp_decay=1.0, + dp_alpha=0.5): + """Discretization bottleneck for latent variables. + + Args: + x: Input to the discretization bottleneck. + hidden_size: Dimension of the latent state. + z_size: Number of bits used to produce discrete code; discrete codes range + from 1 to 2**z_size. + filter_size: Filter size to be used for the embedding function. + name: Name for the bottleneck scope. + mode: Mode represents whether we are training or testing for bottlenecks + that differ in behavior (Default: None). + startup_steps: Number of steps after which latent predictor is trained + (Default: 50000). + bottleneck_kind: Kind of discretization bottleneck to use; one of dvq, + semhash, gumbel-softmax (Default: dvq). 
+    num_blocks: Number of blocks to use for decomposed vector quantization.
+    reshape_method: Method to reshape for DVQ (Default: slice).
+    projection_tensors: If the reshape method is project, then these are the
+      tensors used to project (Default: None).
+    means: The embedding table for dvq (Default: None).
+    beta: Beta factor for the DVQ loss (Default: 0.25).
+    noise_dev: Stddev for noise added for semhash (Default: 1.0).
+    decay: Decay factor for the exponential moving average (Default: 0.999).
+    discrete_mix: Factor for mixing discrete and non-discrete input for semhash
+      (Default: 0.5).
+    random_top_k: Noisy top-k for DVQ (Default: 1).
+    epsilon: Epsilon parameter for DVQ (Default: 1e-5).
+    softmax_k: If > 1 then do top-k softmax (Default: 0).
+    kl_warmup_steps: Number of steps for kl warmup (Default: 150000).
+    ema: If True update embeddings using exponential moving averages (Default:
+      True).
+    ema_count: Table of counts for each embedding corresponding to how many
+      examples in a batch it was the closest to (Default: None).
+    ema_means: Exponentially averaged version of the embeddings (Default: None).
+    summary: If True, then write summaries (Default: True).
+    dp_strength: Strength of Dirichlet Process loss prior (Default: 1.0).
+    dp_decay: Decay the dp_strength using an exponential decay using this
+      term (Default: 1.0).
+    dp_alpha: Alpha term (pseudo-count) in Dirichlet Process (Default: 0.5).
+
+  Returns:
+    Embedding to pass to the decoder, discrete latent, loss, and the embedding
+    function.
+
+  Raises:
+    ValueError: If projection_tensors is None for reshape_method project, or
+      ema_count or ema_means is None if we are using ema, or unknown args.
+  """
+  block_v_size = None
+  if bottleneck_kind == 'dvq':
+    # Define the dvq parameters
+    assert means is not None
+
+    # Check block dimensions add up
+    if hidden_size % num_blocks != 0:
+      raise ValueError('num_blocks does not divide hidden size')
+
+    if 2**z_size % num_blocks != 0:
+      raise ValueError('num_blocks does not divide embedding table size')
+
+    block_v_size = 2**(z_size / num_blocks)
+    block_v_size = int(block_v_size)
+
+    # Set the reshape method corresponding to projections or slices
+    if reshape_method == 'slice':
+      reshape_fn = partial(
+          slice_hidden, hidden_size=hidden_size, num_blocks=num_blocks)
+    elif reshape_method == 'project':
+      if projection_tensors is None:
+        raise ValueError(
+            'Projection tensors is None for reshape_method project')
+      reshape_fn = partial(
+          project_hidden,
+          projection_tensors=projection_tensors,
+          hidden_size=hidden_size,
+          num_blocks=num_blocks)
+    else:
+      raise ValueError('Unknown reshape_method')
+
+    # Check if the ema settings make sense
+    if ema:
+      if ema_count is None:
+        raise ValueError('ema_count is None but ema is True')
+      if ema_means is None:
+        raise ValueError('ema_means is None but ema is True')
+
+  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
+    l = tf.constant(0.0)
+    if bottleneck_kind == 'dense':
+      c = tf.layers.dense(x, z_size, name='vcc')
+      h1 = tf.layers.dense(c, filter_size, name='vch1')
+    elif bottleneck_kind == 'vae':
+      c, l, _, _ = vae(x, 'vae', z_size)
+      h1 = tf.layers.dense(c, filter_size, name='vch1')
+    elif bottleneck_kind == 'semhash':
+      c = tf.layers.dense(x, z_size, name='vcc')
+      y_clean = common_layers.saturating_sigmoid(c)
+      if summary:
+        tf.summary.histogram('y_clean', tf.reshape(y_clean, [-1]))
+      if noise_dev > 0 and mode == tf.estimator.ModeKeys.TRAIN:
+        noise = tf.truncated_normal(
+            common_layers.shape_list(c), mean=0.0, stddev=noise_dev)
+        y = common_layers.saturating_sigmoid(c + noise)
+      else:
+        y = y_clean
+      d = tf.to_float(tf.less(0.5, y))
+      y_discrete = tf.stop_gradient(d) + y - tf.stop_gradient(y)
+      pd = common_layers.inverse_exp_decay(startup_steps * 2)
+      pd *= discrete_mix
+      pd = pd if mode == tf.estimator.ModeKeys.TRAIN else 1.0
+      c = tf.where(
+          tf.less(tf.random_uniform([common_layers.shape_list(y)[0]]), pd),
+          y_discrete, y)
+      h1a = tf.layers.dense(c, filter_size, name='vch1a')
+      h1b = tf.layers.dense(1.0 - c, filter_size, name='vch1b')
+      h1 = h1a + h1b
+      dx = tf.to_int32(tf.stop_gradient(d))
+      c = bit_to_int(dx, z_size)
+    elif bottleneck_kind == 'gumbel-softmax':
+      _, hot, l = gumbel_softmax(x, name, z_size, mode, softmax_k,
+                                 kl_warmup_steps, summary)
+      c = tf.argmax(hot, axis=-1)
+      h1 = tf.layers.dense(hot, hidden_size, name='dae_dense')
+    elif bottleneck_kind == 'dvq':
+      x_reshaped = reshape_fn(x)
+      x_means_hot, x_means, q_loss, e_loss = embedding_lookup(
+          x_reshaped, means, num_blocks, block_v_size, random_top_k)
+
+      # Get the discrete latent representation
+      x_means_idx = tf.argmax(x_means_hot, axis=-1)
+
+      # Get the binary representation
+      x_means_bits = int_to_bit(
+          x_means_idx, num_bits=int(z_size / num_blocks), base=2)
+      shape = common_layers.shape_list(x_means_bits)
+      new_shape = shape[:-1]
+      new_shape[-1] = z_size
+      x_means_bits = tf.reshape(x_means_bits, shape=new_shape)
+      c = bit_to_int(tf.to_int32(x_means_bits), num_bits=z_size, base=2)
+
+      # Adjust shape of c
+      shape_x = common_layers.shape_list(x)
+      new_shape = shape_x[:-1]
+      c = tf.reshape(c, new_shape)
+
+      # Update the ema variables
+      if ema:
+        tf.logging.info('Using EMA with beta = {}'.format(beta))
+        updated_ema_count = moving_averages.assign_moving_average(
+            ema_count,
+            tf.reduce_sum(
+                tf.reshape(x_means_hot, shape=[-1, num_blocks, block_v_size]),
+                axis=0),
+            decay,
+            zero_debias=False)
+
+        # Adding a term that puts a Dirichlet prior over cluster probabilities
+        # Hopefully it'll encourage rich-get-richer behavior
+        dp_prior_loss = 0.
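Both the semhash and DVQ paths above round-trip between integer codes and lower-endian bit vectors through `int_to_bit` and `bit_to_int`. A minimal numpy sketch of that round trip (numpy stands in for the tensor ops purely for illustration; it is not part of the patch):

```python
import numpy as np


def int_to_bit_np(x_int, num_bits, base=2):
  # Lower-endian: bit i is (x // base**i) % base, so 13 -> [1, 0, 1, 1].
  return np.stack([(x_int // base**i) % base for i in range(num_bits)],
                  axis=-1).astype(np.float32)


def bit_to_int_np(x_bit, num_bits, base=2):
  # Inverse of the above: sum_i bit_i * base**i.
  return sum(x_bit[..., i].astype(np.int32) * base**i for i in range(num_bits))


codes = np.array([0, 7, 13])
bits = int_to_bit_np(codes, num_bits=4)
assert np.all(bit_to_int_np(bits, num_bits=4) == codes)
```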
+ if dp_strength > 0.0: + # Decay dp_strength over time to make it less important + dp_strength = tf.train.exponential_decay( + dp_strength, + global_step=tf.to_int32(tf.train.get_global_step()), + decay_steps=20000, + decay_rate=dp_decay) + dp_count = ema_count + dp_alpha + p = dp_count / tf.reduce_sum(dp_count, 1, keepdims=True) + dp_prior_loss = tf.log(p) + dp_prior_loss = -1.0 * tf.reduce_sum(dp_prior_loss) + dp_prior_loss /= (num_blocks * block_v_size) + + x_means_hot_flat = tf.reshape( + x_means_hot, shape=[-1, num_blocks, block_v_size]) + dw = tf.matmul( + tf.transpose(x_means_hot_flat, perm=[1, 2, 0]), + tf.transpose(x_reshaped, perm=[1, 0, 2])) + updated_ema_means = moving_averages.assign_moving_average( + ema_means, dw, decay, zero_debias=False) + n = tf.reduce_sum(updated_ema_count, axis=-1, keep_dims=True) + updated_ema_count = ((updated_ema_count + epsilon) / + (n + 2**z_size * epsilon) * n) + updated_ema_means /= tf.expand_dims(updated_ema_count, axis=-1) + + with tf.control_dependencies([e_loss]): + update_means = tf.assign(means, updated_ema_means) + with tf.control_dependencies([update_means]): + l = beta * e_loss + dp_strength * dp_prior_loss + else: + l = q_loss + beta * e_loss + + x_means = tf.reshape(x_means, shape_x) + x_reshaped = tf.reshape(x_reshaped, shape_x) + h1 = x_reshaped + tf.stop_gradient(x_means - x_reshaped) + else: + raise ValueError('Unknown discretization method.') + + h2 = tf.layers.dense(tf.nn.relu(h1), filter_size, name='vch2') + res = tf.layers.dense(tf.nn.relu(h2), hidden_size, name='vcfin') + + embed_fn = partial( + embed, + hidden_size=hidden_size, + z_size=z_size, + filter_size=filter_size, + name=name, + bottleneck_kind=bottleneck_kind, + num_blocks=num_blocks, + block_v_size=block_v_size, + means=means) + return res, c, l, embed_fn diff --git a/tensor2tensor/layers/discretization_test.py b/tensor2tensor/layers/discretization_test.py new file mode 100644 index 000000000..74eb3d6fb --- /dev/null +++ b/tensor2tensor/layers/discretization_test.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
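For orientation, the sketch below shows roughly how `discrete_bottleneck` is meant to be called for the `dvq` kind, mirroring the signature above. The shapes, sizes, and variable names are illustrative assumptions rather than part of the patch, and this is a rough sketch, not a tested training setup:

```python
import tensorflow as tf
from tensor2tensor.layers import discretization

hidden_size, z_size, num_blocks = 64, 8, 2
block_v_size = 2**(z_size // num_blocks)      # embedding entries per block

tf.train.get_or_create_global_step()          # read by the EMA/DP branch
means = tf.get_variable(
    "means", [num_blocks, block_v_size, hidden_size // num_blocks],
    initializer=tf.uniform_unit_scaling_initializer())
ema_count = tf.get_variable(
    "ema_count", [num_blocks, block_v_size],
    initializer=tf.constant_initializer(0))
ema_means = tf.get_variable("ema_means", initializer=means.initialized_value())

x = tf.random_normal([16, 1, 1, hidden_size])  # e.g. compressed targets
res, c, loss, embed_fn = discretization.discrete_bottleneck(
    x=x, hidden_size=hidden_size, z_size=z_size, filter_size=128, name="vc",
    mode=tf.estimator.ModeKeys.TRAIN, bottleneck_kind="dvq",
    num_blocks=num_blocks, means=means, ema_count=ema_count,
    ema_means=ema_means)
decoded = embed_fn(c)  # maps the discrete code back to a continuous embedding
```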
+ +"""Tests for tensor2tensor.layers.discretization.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +# Dependency imports +import numpy as np +from tensor2tensor.layers import discretization +import tensorflow as tf + + +class DiscretizationTest(tf.test.TestCase): + + def setUp(self): + tf.set_random_seed(1234) + np.random.seed(123) + + def testBitToIntZeros(self): + x_bit = tf.zeros(shape=[1, 10], dtype=tf.float32) + x_int = tf.zeros(shape=[1], dtype=tf.int32) + diff = discretization.bit_to_int(x_bit, num_bits=10) - x_int + with self.test_session() as sess: + tf.global_variables_initializer().run() + d = sess.run(diff) + self.assertEqual(d, 0) + + def testBitToIntOnes(self): + x_bit = tf.ones(shape=[1, 3], dtype=tf.float32) + x_int = 7 * tf.ones(shape=[1], dtype=tf.int32) + diff = discretization.bit_to_int(x_bit, num_bits=3) - x_int + with self.test_session() as sess: + tf.global_variables_initializer().run() + d = sess.run(diff) + self.assertEqual(d, 0) + + def testIntToBitZeros(self): + x_bit = tf.zeros(shape=[1, 10], dtype=tf.float32) + x_int = tf.zeros(shape=[1], dtype=tf.int32) + diff = discretization.int_to_bit(x_int, num_bits=10) - x_bit + with self.test_session() as sess: + tf.global_variables_initializer().run() + d = sess.run(diff) + self.assertTrue(np.all(d == 0)) + + def testIntToBitOnes(self): + x_bit = tf.ones(shape=[1, 3], dtype=tf.float32) + x_int = 7 * tf.ones(shape=[1], dtype=tf.int32) + diff = discretization.int_to_bit(x_int, num_bits=3) - x_bit + with self.test_session() as sess: + tf.global_variables_initializer().run() + d = sess.run(diff) + self.assertTrue(np.all(d == 0)) + + def testProjectHidden(self): + hidden_size = 60 + block_dim = 20 + num_blocks = 3 + x = tf.zeros(shape=[1, hidden_size], dtype=tf.float32) + projection_tensors = tf.random_normal( + shape=[num_blocks, hidden_size, block_dim], dtype=tf.float32) + x_projected = discretization.project_hidden(x, projection_tensors, + hidden_size, num_blocks) + with self.test_session() as sess: + tf.global_variables_initializer().run() + x_projected_eval = sess.run(x_projected) + self.assertEqual(np.shape(x_projected_eval), (1, num_blocks, block_dim)) + self.assertTrue(np.all(x_projected_eval == 0)) + + def testSliceHiddenZeros(self): + hidden_size = 60 + block_dim = 20 + num_blocks = 3 + x = tf.zeros(shape=[1, hidden_size], dtype=tf.float32) + x_sliced = discretization.slice_hidden(x, hidden_size, num_blocks) + with self.test_session() as sess: + tf.global_variables_initializer().run() + x_sliced_eval = sess.run(x_sliced) + self.assertEqual(np.shape(x_sliced_eval), (1, num_blocks, block_dim)) + self.assertTrue(np.all(x_sliced_eval == 0)) + + def testSliceHiddenOnes(self): + hidden_size = 60 + block_dim = 20 + num_blocks = 3 + x = tf.ones(shape=[1, hidden_size], dtype=tf.float32) + x_sliced = discretization.slice_hidden(x, hidden_size, num_blocks) + with self.test_session() as sess: + tf.global_variables_initializer().run() + x_sliced_eval = sess.run(x_sliced) + self.assertEqual(np.shape(x_sliced_eval), (1, num_blocks, block_dim)) + self.assertTrue(np.all(x_sliced_eval == 1)) + + def testNearestNeighbors(self): + x = tf.constant([[0, 0.9, 0], [0.8, 0., 0.]], dtype=tf.float32) + x = tf.expand_dims(x, axis=0) + means = tf.constant( + [[1, 0, 0], [0, 1, 0], [0, 0, 1], [9, 9, 9]], dtype=tf.float32) + means = tf.stack([means, means], axis=0) + x_means_hot = discretization.nearest_neighbor(x, means, block_v_size=4) + x_means_hot_test = np.array([[0, 1, 
0, 0], [1, 0, 0, 0]]) + x_means_hot_test = np.expand_dims(x_means_hot_test, axis=0) + with self.test_session() as sess: + tf.global_variables_initializer().run() + x_means_hot_eval = sess.run(x_means_hot) + self.assertEqual(np.shape(x_means_hot_eval), (1, 2, 4)) + self.assertTrue(np.all(x_means_hot_eval == x_means_hot_test)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 3c54fa339..478e3284f 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -542,3 +542,8 @@ def top(self, body_output, _): def targets_bottom(self, x): """SymbolModality overrides targets_bottom, so need to override here too.""" return self.bottom(x) + + @property + def top_is_pointwise(self): + # pointwise mode manipulates body output, not logits, so it fails here. + return False diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py index df7744ff9..c78d1f52a 100644 --- a/tensor2tensor/models/__init__.py +++ b/tensor2tensor/models/__init__.py @@ -37,6 +37,7 @@ from tensor2tensor.models import vanilla_gan from tensor2tensor.models import xception +from tensor2tensor.models.research import adafactor_experiments from tensor2tensor.models.research import aligned from tensor2tensor.models.research import attention_lm from tensor2tensor.models.research import attention_lm_moe @@ -47,5 +48,6 @@ from tensor2tensor.models.research import transformer_moe from tensor2tensor.models.research import transformer_revnet from tensor2tensor.models.research import transformer_sketch +from tensor2tensor.models.research import transformer_symshard from tensor2tensor.models.research import transformer_vae # pylint: enable=unused-import diff --git a/tensor2tensor/models/image_transformer_2d.py b/tensor2tensor/models/image_transformer_2d.py index 5ab0d112b..046fa06ee 100644 --- a/tensor2tensor/models/image_transformer_2d.py +++ b/tensor2tensor/models/image_transformer_2d.py @@ -424,6 +424,35 @@ def imagetransformer2d_tiny(): return hparams +def update_hparams_for_tpu(hparams): + hparams.use_pad_remover = False # where op not supported + hparams.optimizer = "TrueAdam" + hparams.batch_size = 4 + + +@registry.register_hparams +def img2mg_transformer_base_tpu(): + """Hparams for training img2img_transformer on tpu.""" + hparams = img2img_transformer_base() + update_hparams_for_tpu(hparams) + hparams.batch_size = 4 + hparams.num_heads = 4 # heads are expensive on tpu + hparams.num_decoder_layers = 8 + hparams.num_encoder_layers = 4 + hparams.shared_embedding_and_softmax_weights = False + return hparams + + +@registry.register_hparams +def img2mg_transformer_tiny_tpu(): + hparams = img2mg_transformer_base_tpu() + hparams.num_hidden_layers = 2 + hparams.hidden_size = 16 + hparams.batch_size = 2 + hparams.num_heads = 2 + return hparams + + @registry.register_hparams def img2img_transformer2d_n3(): hparams = img2img_transformer2d_base() diff --git a/tensor2tensor/models/research/adafactor_experiments.py b/tensor2tensor/models/research/adafactor_experiments.py new file mode 100644 index 000000000..d7031dee2 --- /dev/null +++ b/tensor2tensor/models/research/adafactor_experiments.py @@ -0,0 +1,173 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experiments with Adafactor. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensor2tensor.models import transformer +from tensor2tensor.utils import registry + + +def mimic_adam_with_adafactor(hparams): + """Switch from Adam to Adafactor, approximating the behavior of Adam. + + Some minor things may be different, like epsilon and beta1 correction. + + Args: + hparams: model hyperparameters where "Adam" in hparams.optimizer + """ + assert "Adam" in hparams.optimizer + hparams.optimizer = "Adafactor" + hparams.optimizer_adafactor_beta1 = hparams.optimizer_adam_beta1 + hparams.optimizer_adafactor_beta2 = hparams.optimizer_adam_beta2 + hparams.optimizer_adafactor_multiply_by_parameter_scale = False + hparams.optimizer_adafactor_factored = False + hparams.optimizer_adafactor_clipping_threshold = None + hparams.optimizer_adafactor_decay_type = "Adam" + + +@registry.register_hparams +def afx_adam(): + """Old version - Adam.""" + hparams = transformer.transformer_base_v2() + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.999 + hparams.symbol_modality_num_shards = 1 + hparams.batch_size = 2048 + hparams.optimizer = "Adam" + hparams.learning_rate_schedule = ( + "constant*rsqrt_decay*linear_warmup*rsqrt_hidden_size") + hparams.learning_rate_constant = 2.0 + return hparams + + +@registry.register_hparams +def afx_mimic_adam(): + """Emulating Adam - should be very similar to afx_adam.""" + hparams = afx_adam() + mimic_adam_with_adafactor(hparams) + return hparams + + +@registry.register_hparams +def afx_base(): + """Baseline - no momentum, beta=0.999.""" + hparams = afx_mimic_adam() + hparams.optimizer_adafactor_beta1 = 0.0 + return hparams + + +@registry.register_hparams +def afx_factored(): + hparams = afx_base() + hparams.optimizer_adafactor_factored = True + return hparams + + +@registry.register_hparams +def afx_fast(): + hparams = afx_base() + hparams.optimizer_adafactor_beta2 = 0.9 + return hparams + + +@registry.register_hparams +def afx_clip(): + hparams = afx_base() + hparams.optimizer_adafactor_clipping_threshold = 1.0 + return hparams + + +@registry.register_hparams +def afx_clip2(): + hparams = afx_base() + hparams.optimizer_adafactor_clipping_threshold = 2.0 + return hparams + + +@registry.register_hparams +def afx_clip_factored(): + hparams = afx_clip() + hparams.optimizer_adafactor_factored = True + return hparams + + +@registry.register_hparams +def afx_pow05(): + hparams = afx_base() + hparams.optimizer_adafactor_decay_type = "pow" + hparams.optimizer_adafactor_memory_exponent = 0.5 + return hparams + + +@registry.register_hparams +def afx_pow08(): + hparams = afx_pow05() + hparams.optimizer_adafactor_memory_exponent = 0.8 + return hparams + + +@registry.register_hparams +def afx_pow10(): + hparams = afx_pow05() + hparams.optimizer_adafactor_memory_exponent = 1.0 + return hparams + + +@registry.register_hparams +def afx_pow08_clip(): + hparams = afx_pow08() + hparams.optimizer_adafactor_clipping_threshold = 1.0 + return hparams + + +@registry.register_hparams +def 
afx_relative(): + hparams = afx_base() + hparams.optimizer_adafactor_multiply_by_parameter_scale = True + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 + return hparams + + +@registry.register_hparams +def afx_unscale(): + hparams = afx_base() + hparams.shared_embedding_and_softmax_weights = False + hparams.multiply_embedding_mode = "none" + return hparams + + +@registry.register_hparams +def afx_unscale_relative(): + hparams = afx_unscale() + hparams.optimizer_adafactor_multiply_by_parameter_scale = True + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 + return hparams + + +@registry.register_hparams +def afx_adafactor(): + """Adafactor with recommended learning rate schedule.""" + hparams = afx_adam() + hparams.optimizer = "Adafactor" + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 + return hparams diff --git a/tensor2tensor/models/research/transformer_symshard.py b/tensor2tensor/models/research/transformer_symshard.py new file mode 100644 index 000000000..64b9fed97 --- /dev/null +++ b/tensor2tensor/models/research/transformer_symshard.py @@ -0,0 +1,416 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test of the SymShard programming model. + +Symmetric model parallellism. + +Each shard (device) has a similar structure with different weights. +Occasional allreduce (sum) across shards. + +On TPU, we replicate the whole model on each core. This is not the intended +use, but we can test the model quality. + +Example problem: translate_ende_8k_packed + +Preliminary results on languagemodel_lm1b8k_packed (200k steps 8 cores) + transformer_tpu: 48M params dev-log-ppl=-1.29 dev-BLEU=27.0 + transformer_symshard_sh4: 49M params dev-log-ppl=-1.30 dev-BLEU=26.4 + transformer_symshard_base: 98M params dev-log-ppl=-1.23 dev-BLEU=27.6 + + transformer_symshard_base with different mixing fraction (default=0.5): + mix_fraction=0.0 dev-log-ppl=-1.33 + mix_fraction=0.25 dev-log-ppl=-1.23 + mix_fraction=0.5 dev-log-ppl=-1.23 + mix_fraction=0.75 dev-log-ppl=-1.24 + mix_fraction=1.0 dev-log-ppl=-1.28 + +TODO(noam): Make sure no one is using super_lm, then delete it. 
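All of the `afx_*` sets above are small deltas on one shared baseline, which keeps optimizer comparisons cheap to express. A quick sketch of how one of them composes, assuming the module imports as written below; the asserts only restate what the definitions above imply:

```python
from tensor2tensor.models.research import adafactor_experiments

hparams = adafactor_experiments.afx_clip()
# afx_clip -> afx_base -> afx_mimic_adam -> afx_adam, then the overrides above.
assert hparams.optimizer == "Adafactor"
assert hparams.optimizer_adafactor_beta1 == 0.0
assert hparams.optimizer_adafactor_clipping_threshold == 1.0
```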
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers +from tensor2tensor.utils import expert_utils +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class TransformerSymshard(t2t_model.T2TModel): + """See file docstring.""" + + def body(self, features): + hparams = self._hparams + ps_devices = self._ps_devices + single_device = (len(ps_devices) == 1) + assert hparams.num_model_shards % len(ps_devices) == 0 + shards_per_device = hparams.num_model_shards // len(ps_devices) + model_devices = [ps_devices[i // shards_per_device] + for i in xrange(hparams.num_model_shards)] + print("model_devices = %s" % model_devices) + mp = expert_utils.Parallelism(model_devices, reuse=False) + targets_vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size + # squeeze out channels, heights + targets = tf.squeeze(features["targets_raw"], [2, 3]) + targets_embedding_var = mp( + tf.get_variable, "embedding", + [[targets_vocab_size, hparams.hidden_size]] * mp.n, + initializer=tf.random_normal_initializer( + 0.0, hparams.hidden_size**-0.5)) + shifted_targets = common_layers.shift_right_2d(targets) + # Bypass the symbol modality and use a different embedding on each shard. + if single_device: + targets_embedding_var_combined = tf.concat(targets_embedding_var, 1) + decoder_input_combined = common_layers.embedding( + shifted_targets, targets_vocab_size, + hparams.hidden_size * mp.n, + multiplier=hparams.hidden_size**0.5, + embedding_var=targets_embedding_var_combined, + ) + decoder_input = tf.split(decoder_input_combined, mp.n, axis=2) + else: + targets_embedding_var_combined = None + decoder_input = mp( + common_layers.embedding, shifted_targets, targets_vocab_size, + hparams.hidden_size, + multiplier=hparams.hidden_size**0.5, + embedding_var=targets_embedding_var, + ) + decoder_self_attention_bias = mp( + common_attention.attention_bias_lower_triangle, + tf.shape(targets)[1]) + if "targets_segmentation" in features: + # "Packed" dataset - keep the examples from seeing each other. 
+ targets_segmentation = features["targets_segmentation"] + targets_position = features["targets_position"] + decoder_self_attention_bias = mp( + tf.add, decoder_self_attention_bias, + mp(common_attention.attention_bias_same_segment, + targets_segmentation, targets_segmentation)) + decoder_input = mp( + common_attention.add_timing_signal_1d_given_position, + decoder_input, targets_position) + else: + targets_position = None + decoder_self_attention_bias = mp( + common_attention.attention_bias_lower_triangle, + tf.shape(targets)[1]) + decoder_input = mp(common_attention.add_timing_signal_1d, decoder_input) + + if self.has_input: + inputs = tf.squeeze(features["inputs_raw"], [2, 3]) + inputs_vocab_size = self._problem_hparams.vocabulary["inputs"].vocab_size + # share everything for now + share_inputs_and_targets_embedding = True + if share_inputs_and_targets_embedding: + assert inputs_vocab_size == targets_vocab_size + inputs_embedding_var = targets_embedding_var + inputs_embedding_var_combined = targets_embedding_var_combined + if single_device: + encoder_input_combined = common_layers.embedding( + inputs, inputs_vocab_size, + hparams.hidden_size * mp.n, + multiplier=hparams.hidden_size**0.5, + embedding_var=inputs_embedding_var_combined, + ) + encoder_input = tf.split(encoder_input_combined, mp.n, axis=2) + else: + encoder_input = mp( + common_layers.embedding, inputs, inputs_vocab_size, + hparams.hidden_size, + multiplier=hparams.hidden_size**0.5, + embedding_var=inputs_embedding_var, + ) + if "inputs_segmentation" in features: + # "Packed" dataset - keep the examples from seeing each other. + inputs_segmentation = features["inputs_segmentation"] + inputs_position = features["inputs_position"] + encoder_self_attention_bias = mp( + common_attention.attention_bias_same_segment, + inputs_segmentation, inputs_segmentation) + encoder_decoder_attention_bias = mp( + common_attention.attention_bias_same_segment, + targets_segmentation, inputs_segmentation) + encoder_input = mp( + common_attention.add_timing_signal_1d_given_position, + encoder_input, inputs_position) + else: + encoder_padding = tf.to_float(tf.equal(inputs, 0)) + ignore_padding = common_attention.attention_bias_ignore_padding( + encoder_padding) + encoder_self_attention_bias = ignore_padding + encoder_decoder_attention_bias = ignore_padding + inputs_position = None + encoder_input = mp(common_attention.add_timing_signal_1d, encoder_input) + + # encoder stack here + with tf.variable_scope("encoder"): + encoder_input = mp( + tf.nn.dropout, encoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + encoder_output = _layer_stack( + mp, + encoder_input, + encoder_self_attention_bias, + hparams.encoder_layers, + hparams) + else: + encoder_decoder_attention_bias = None + encoder_output = None + + with tf.variable_scope("decoder"): + decoder_input = mp( + tf.nn.dropout, decoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + decoder_output = _layer_stack( + mp, + decoder_input, + decoder_self_attention_bias, + layers=hparams.decoder_layers, + hparams=hparams, + encoder_output=encoder_output, + encoder_decoder_attention_bias=encoder_decoder_attention_bias) + + # Bypass the symbol modality and compute logits directly. + # We compute a different set of logits on each shard, and sum them. + # Share the weights with the target embedding. 
+ output_var = targets_embedding_var + output_var_combined = targets_embedding_var_combined + if single_device: + decoder_output = tf.concat(decoder_output, 2) + logits = tf.tensordot(decoder_output, output_var_combined, [[2], [1]]) + num, denom = common_layers.padded_cross_entropy( + logits, targets, hparams.label_smoothing) + training_loss = num / denom + else: + logits = mp( + tf.tensordot, decoder_output, output_var, [[[2], [1]]] * mp.n) + logits = common_layers.all_reduce_ring(logits, mp) + # On each device, we compute the loss for a part of the batch. + # This is faster than computing the whole loss on one shard. + mp, logits = common_layers.reduce_by_device(mp, logits, lambda l: l[0]) + def _loss_for_shard(logits, targets, shard): + logits = common_layers.approximate_split(logits, mp.n, 0)[shard] + targets = common_layers.approximate_split(targets, mp.n, 0)[shard] + return common_layers.padded_cross_entropy( + logits, targets, hparams.label_smoothing) + num, denom = mp(_loss_for_shard, logits, targets, range(mp.n)) + training_loss = tf.add_n(num) / tf.add_n(denom) + logits = logits[0] + logits = tf.expand_dims(tf.expand_dims(logits, 2), 3) + # override training loss so that it is not computed externally. + losses = {"training": training_loss} + return logits, losses + + +def _layer_stack(mp, + inputs, + self_attention_bias, + layers, + hparams, + encoder_output=None, + encoder_decoder_attention_bias=None): + """A stack of layers. + + Args: + mp: a Parallelism object + inputs: a list of Tensors + self_attention_bias: list of bias Tensor for self-attention + (see common_attention.attention_bias()) + layers: a string + hparams: hyperparameters for model + encoder_output: optional list of tensors + encoder_decoder_attention_bias: optional list of tensors + + Returns: + y: a list of Tensors + """ + layers = layers.strip(",").split(",") + + # scaled_dot_product_attention_with_projections uses a 3d attention bias + # (no heads), where multihead_attention uses 4d attention bias. 
+ self_attention_bias_3d = mp(tf.squeeze, self_attention_bias, 1) + if encoder_decoder_attention_bias is not None: + encoder_decoder_attention_bias_3d = mp( + tf.squeeze, encoder_decoder_attention_bias, 1) + relu_dropout_broadcast_dims = ( + common_layers.comma_separated_string_to_integer_list( + getattr(hparams, "relu_dropout_broadcast_dims", ""))) + mix_size = int(hparams.mix_fraction * hparams.hidden_size) + accumulator = inputs + x = inputs + for layer_num, layer_type in enumerate(layers): + with tf.variable_scope("%s_%d" % (layer_type, layer_num)): + tf.logging.info("%s_%d" % (layer_type, layer_num)) + if layer_type == "a": + # accumulate + accumulator = mp(tf.add, x, accumulator) + x = accumulator + elif layer_type == "n": + # normalize + x = mp(common_layers.apply_norm, + x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) + elif layer_type == "d": + # dropout + x = mp(tf.nn.dropout, x, 1.0 - hparams.layer_prepostprocess_dropout) + elif layer_type == "m": + if mix_size > 0: + # mix across shards + def _split(t): + return tuple(tf.split( + t, [mix_size, hparams.hidden_size - mix_size], 2)) + to_mix, to_keep = mp(_split, x) + mixed = common_layers.all_reduce_ring(to_mix, mp) + mixed = mp(tf.multiply, mixed, mp.n ** -0.5) + x = mp(lambda a, b: tf.concat([a, b], 2), mixed, to_keep) + elif layer_type == "att": + # single-head attention + q = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, + name="q_transform") + x = mp( + common_attention.scaled_dot_product_attention_simple, + q, x, x, self_attention_bias_3d) + x = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, + name="o_transform") + elif layer_type == "enc-att": + # single-head attention over encoder + q = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, + name="q_transform") + assert encoder_output is not None + x = mp( + common_attention.scaled_dot_product_attention_simple, + q, encoder_output, encoder_output, + encoder_decoder_attention_bias_3d) + x = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, + name="o_transform") + elif layer_type == "multihead-att": + # multi-head attention + x = mp( + common_attention.multihead_attention, + x, + None, + self_attention_bias, # bias + hparams.multihead_attention_key_channels or hparams.hidden_size, + hparams.multihead_attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.multihead_attention_num_heads, + hparams.attention_dropout) + elif layer_type == "enc-multihead-att": + # multi-head attention + x = mp( + common_attention.multihead_attention, + x, + encoder_output, + encoder_decoder_attention_bias, # bias + hparams.multihead_attention_key_channels or hparams.hidden_size, + hparams.multihead_attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.multihead_attention_num_heads, + hparams.attention_dropout) + elif layer_type == "ffn": + x = mp( + common_layers.dense_relu_dense, x, + hparams.filter_size, hparams.hidden_size, + dropout=hparams.relu_dropout, + dropout_broadcast_dims=[relu_dropout_broadcast_dims] * mp.n) + else: + assert False, "unknown sublayer %s" % layer_type + return x + + +@registry.register_hparams +def transformer_symshard_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.hidden_size = 256 + hparams.batch_size = 2048 + hparams.max_length = 0 + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. 
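The `layers` argument of `_layer_stack` is a comma-separated program over the sublayer codes handled above (`a` accumulate, `n` normalize, `d` dropout, `m` mix across shards, `att` and `enc-att` single-head attention, `multihead-att` and `enc-multihead-att`, `ffn`). A small sketch of how such a spec expands, mirroring the `strip(",").split(",")` parsing in `_layer_stack`:

```python
# Two encoder blocks followed by a final normalize-and-dropout, as a spec string.
encoder_spec = ("n,att,m,d,a," "n,ffn,m,d,a,") * 2 + "n,d"
sublayers = encoder_spec.strip(",").split(",")
# ['n', 'att', 'm', 'd', 'a', 'n', 'ffn', 'm', 'd', 'a',
#  'n', 'att', 'm', 'd', 'a', 'n', 'ffn', 'm', 'd', 'a', 'n', 'd']
assert len(sublayers) == 22
```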
+ hparams.layer_prepostprocess_dropout = 0.2 + hparams.add_hparam("attention_dropout", 0.1) + hparams.add_hparam("relu_dropout", 0.0) + hparams.add_hparam("relu_dropout_broadcast_dims", "1") + hparams.layer_prepostprocess_dropout = 0.1 + hparams.layer_prepostprocess_dropout_broadcast_dims = "1" # length + hparams.label_smoothing = 0.1 + hparams.clip_grad_norm = 0. # i.e. no gradient clipping + hparams.optimizer = "Adafactor" + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 + hparams.initializer_gain = 1.0 + hparams.initializer = "uniform_unit_scaling" + hparams.weight_decay = 0.0 + # TODO(noam): use this to control sharing. We now share always + hparams.shared_embedding_and_softmax_weights = True + # we only want one data shard. + hparams.no_data_parallelism = True + # bypass the symbol modality so that we can use model parallelism. + hparams.target_modality = "symbol:identity" + hparams.input_modalities = "inputs:symbol:identity" + hparams.add_hparam("filter_size", 1280) + hparams.add_hparam("mix_fraction", 0.5) + # attention-related flags + hparams.add_hparam("multihead_attention_num_heads", 4) + hparams.add_hparam("multihead_attention_key_channels", 0) + hparams.add_hparam("multihead_attention_value_channels", 0) + hparams.add_hparam("pos", "timing") # timing, none + hparams.add_hparam( + "encoder_layers", ("n,att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d") + hparams.add_hparam( + "decoder_layers", + ("n,att,m,d,a," "n,enc-att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d") + # Number of model shards - each one has separate parameters. + # Changing this number invalidates checkpoints. + hparams.add_hparam("num_model_shards", 8) + return hparams + + +@registry.register_hparams +def transformer_symshard_sh4(): + """4 shards instead of 8. 
Similar model size to transformer_tpu().""" + hparams = transformer_symshard_base() + hparams.num_model_shards = 4 + return hparams + + +@registry.register_hparams +def transformer_symshard_lm_0(): + """For language modeling - suggested problem languagemodel_lm1b8k_packed.""" + hparams = transformer_symshard_base() + hparams.label_smoothing = 0 + return hparams + + +@registry.register_hparams +def transformer_symshard_h4(): + """4 heads per shard.""" + hparams = transformer_symshard_base() + hparams.encoder_layers = ("n,multihead-att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d" + hparams.decoder_layers = ( + ("n,multihead-att,m,d,a," "n,enc-multihead-att,m,d,a," "n,ffn,m,d,a,") * 6 + + "n,d") + return hparams diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index 6ad4e19a5..4b37528ea 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -18,25 +18,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - +from functools import partial import math - # Dependency imports - -from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_image_attention as cia from tensor2tensor.layers import common_layers +from tensor2tensor.layers import discretization from tensor2tensor.models import transformer from tensor2tensor.utils import beam_search from tensor2tensor.utils import expert_utils from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model - - import tensorflow as tf -from tensorflow.python.training import moving_averages - _DO_SUMMARIES = True @@ -110,325 +104,6 @@ def top_k_experts(x, k, hparams): return gates, load_loss -def gumbel_sample(shape): - """Sample from the Gumbel distribution, protect from overflows.""" - uniform_samples = tf.random_uniform(shape, minval=0.00001, maxval=0.99998) - return -tf.log(-tf.log(uniform_samples)) - - -def dae(x, hparams, name): - with tf.variable_scope(name): - m = tf.layers.dense(x, hparams.v_size, name="mask") - if hparams.softmax_k > 0: - m, kl = top_k_softmax(m, hparams.softmax_k) - return m, m, 1.0 - tf.reduce_mean(kl) - logsm = tf.nn.log_softmax(m) - # Gumbel-softmax sample. - gumbel_samples = gumbel_sample(common_layers.shape_list(m)) - steps = hparams.kl_warmup_steps - gumbel_samples *= common_layers.inverse_exp_decay(steps // 5) * 0.5 - temperature = 1.2 - common_layers.inverse_lin_decay(steps) - # 10% of the time keep reasonably high temperature to keep learning. - temperature = tf.cond(tf.less(tf.random_uniform([]), 0.9), - lambda: temperature, - lambda: tf.random_uniform([], minval=0.5, maxval=1.0)) - s = tf.nn.softmax((logsm + gumbel_samples) / temperature) - m = tf.nn.softmax(m) - kl = - tf.reduce_max(logsm, axis=-1) - if _DO_SUMMARIES: - tf.summary.histogram("max-log", tf.reshape(kl, [-1])) - # Calculate the argmax and construct hot vectors. - maxvec = tf.reshape(tf.argmax(m, axis=-1), [-1]) - maxvhot = tf.stop_gradient(tf.one_hot(maxvec, hparams.v_size)) - # Add losses that prevent too few being used. 
- distrib = tf.reshape(logsm, [-1, hparams.v_size]) * maxvhot - d_mean = tf.reduce_mean(distrib, axis=[0], keep_dims=True) - d_variance = tf.reduce_mean(tf.square(distrib - d_mean), axis=[0]) - d_dev = - tf.reduce_mean(d_variance) - ret = s - if hparams.mode != tf.contrib.learn.ModeKeys.TRAIN: - ret = tf.reshape(maxvhot, common_layers.shape_list(s)) # Just hot @eval. - return m, ret, d_dev * 5.0 + tf.reduce_mean(kl) * 0.002 - - -def vae(x, z_size, name): - with tf.variable_scope(name): - mu = tf.layers.dense(x, z_size, name="mu") - log_sigma = tf.layers.dense(x, z_size, name="log_sigma") - shape = common_layers.shape_list(x) - epsilon = tf.random_normal([shape[0], shape[1], 1, z_size]) - z = mu + tf.exp(log_sigma / 2) * epsilon - kl = 0.5 * tf.reduce_mean( - tf.exp(log_sigma) + tf.square(mu) - 1. - log_sigma, axis=-1) - free_bits = z_size // 4 - kl_loss = tf.reduce_mean(tf.maximum(kl - free_bits, 0.0)) - return z, kl_loss, mu, log_sigma - - -def project_hidden(x, hparams): - """Project encoder hidden state into block_dim using projection tensors. - - Args: - x: Encoder hidden state of shape [-1, hidden_size] - hparams: Hparams - - Returns: - Projected states of shape [-1, num_blocks, block_dim]. - """ - x = tf.reshape(x, shape=[1, -1, hparams.hidden_size]) - x_tiled = tf.reshape( - tf.tile(x, multiples=[hparams.num_blocks, 1, 1]), - shape=[hparams.num_blocks, -1, hparams.hidden_size]) - x_projected = tf.matmul(x_tiled, hparams.projection_tensors) - x_projected = tf.transpose(x_projected, perm=[1, 0, 2]) - return x_projected - - -def slice_hidden(x, hparams): - """Slice encoder hidden state into block_dim. - - Args: - x: Encoder hidden state of shape [-1, hidden_size] - hparams: Hparams - - Returns: - Sliced states of shape [-1, num_blocks, block_dim]. 
- """ - assert hparams.num_blocks * hparams.block_dim == hparams.hidden_size - x_sliced = tf.reshape(x, shape=[-1, hparams.num_blocks, hparams.block_dim]) - return x_sliced - - -def nearest(x, means, hparams): - """Find the nearest means to elements in x.""" - x_reshaped = hparams.reshape_fn(x, hparams) - x_norm_sq = tf.reduce_sum(tf.square(x_reshaped), axis=-1, keep_dims=True) - means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keep_dims=True) - scalar_prod = tf.matmul( - tf.transpose(x_reshaped, perm=[1, 0, 2]), - tf.transpose(means, perm=[0, 2, 1])) - scalar_prod = tf.transpose(scalar_prod, perm=[1, 0, 2]) - dist = x_norm_sq + tf.transpose( - means_norm_sq, perm=[2, 0, 1]) - 2 * scalar_prod - if hparams.random_top_k > 1: - _, top_k_idx = tf.nn.top_k(-dist, k=hparams.random_top_k) - nearest_idx = tf.gather( - top_k_idx, - tf.random_uniform( - [1], minval=0, maxval=hparams.random_top_k - 1, dtype=tf.int32), - axis=-1) - else: - nearest_idx = tf.argmax(-dist, axis=-1) - nearest_hot = tf.one_hot(nearest_idx, hparams.block_v_size) - shape = common_layers.shape_list(x) - shape[-1] = hparams.num_blocks - shape.append(hparams.block_v_size) - nearest_hot = tf.reshape(nearest_hot, shape=shape) - return tf.stop_gradient(nearest_hot) - - -def kmeans(x, means, hparams): - """Compute the nearest neighbors and the loss for training the embeddings.""" - x_means_hot = nearest(x, means, hparams) - x_means_hot_flat = tf.reshape(x_means_hot, - [-1, hparams.num_blocks, hparams.block_v_size]) - x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means) - x_means = tf.transpose(x_means, [1, 0, 2]) - x_reshaped = hparams.reshape_fn(x, hparams) - q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x_reshaped) - x_means))) - e_loss = tf.reduce_mean(tf.square(x_reshaped - tf.stop_gradient(x_means))) - return x_means_hot, x_means, q_loss, e_loss - - -def bit_to_int(x_bit, nbits, base=2): - """Turn x_bit representing numbers bitwise (lower-endian) to int tensor.""" - x_l = tf.stop_gradient(tf.reshape(x_bit, [-1, nbits])) - x_labels = [] - for i in range(nbits): - x_labels.append(x_l[:, i] * tf.to_int32(base)**tf.to_int32(i)) - res = sum(x_labels) - return tf.to_int32(tf.reshape(res, common_layers.shape_list(x_bit)[:-1])) - - -def int_to_bit(x_int, nbits, base=2): - """Turn x_int representing numbers into a bitwise (lower-endian) tensor.""" - x_l = tf.expand_dims(x_int, axis=-1) - x_labels = [] - for i in range(nbits): - x_labels.append( - tf.floormod( - tf.floordiv(tf.to_int32(x_l), - tf.to_int32(base)**i), tf.to_int32(base))) - res = tf.concat(x_labels, axis=-1) - return tf.to_float(res) - - -def bottleneck(x, - hparams, - filter_size, - name, - means=None, - ema_count=None, - ema_means=None): - """Bottleneck.""" - if hparams.bottleneck_kind == "vq-vae": - assert means is not None - if hparams.ema: - assert ema_count is not None - assert ema_means is not None - - def embed(x): - """Embedding function; must be compatible with the code later.""" - with tf.variable_scope(name, reuse=tf.AUTO_REUSE): - if hparams.bottleneck_kind == "semhash": - c = int_to_bit(x, z_size) - h1a = tf.layers.dense(c, filter_size, name="vch1a") - h1b = tf.layers.dense(1.0 - c, filter_size, name="vch1b") - h1 = h1a + h1b - elif hparams.bottleneck_kind == "gumbel-softmax": - hot = tf.one_hot(x, hparams.v_size) - h1 = tf.layers.dense(hot, hparams.hidden_size, name="dae_dense") - elif hparams.bottleneck_kind == "vq-vae": - shape_x = common_layers.shape_list(x) - x_flat = tf.reshape(x, [-1, 1]) - c = int_to_bit(x_flat, 
nbits=hparams.z_size, base=2) - shape = common_layers.shape_list(c) - new_shape = shape - new_shape[-1] = hparams.num_blocks - new_shape.append(int(hparams.z_size / hparams.num_blocks)) - c = tf.to_int32(tf.reshape(c, shape=new_shape)) - c = bit_to_int( - c, - nbits=int(hparams.z_size / hparams.num_blocks), - base=2) - c_hot = tf.one_hot(c, depth=hparams.block_v_size, axis=-1) - c_hot_flat = tf.reshape( - c_hot, shape=[-1, hparams.num_blocks, hparams.block_v_size]) - h1 = tf.matmul(tf.transpose(c_hot_flat, perm=[1, 0, 2]), means) - h1 = tf.transpose(h1, perm=[1, 0, 2]) - new_shape = shape_x - new_shape.append(hparams.hidden_size) - h1 = tf.reshape(h1, new_shape) - elif hparams.bottleneck_kind == "rounding": - h1 = x - - h2 = tf.layers.dense(tf.nn.relu(h1), filter_size, name="vch2") - return tf.layers.dense(tf.nn.relu(h2), hparams.hidden_size, name="vcfin") - - with tf.variable_scope(name, reuse=tf.AUTO_REUSE): - z_size = hparams.z_size - l = tf.constant(0.0) - if hparams.bottleneck_kind == "dense": - c = tf.layers.dense(x, z_size, name="vcc") - h1 = tf.layers.dense(c, filter_size, name="vch1") - if hparams.bottleneck_kind == "vae": - c, l, _, _ = vae(x, z_size, "vae") - h1 = tf.layers.dense(c, filter_size, name="vch1") - if hparams.bottleneck_kind == "semhash": - c = tf.layers.dense(x, z_size, name="vcc") - y_clean = common_layers.saturating_sigmoid(c) - if _DO_SUMMARIES: - tf.summary.histogram("y_clean", tf.reshape(y_clean, [-1])) - if hparams.noise_dev > 0 and hparams.mode == tf.estimator.ModeKeys.TRAIN: - dev = hparams.noise_dev - noise = tf.truncated_normal(common_layers.shape_list(c), - mean=0.0, stddev=dev) - y = common_layers.saturating_sigmoid(c + noise) - else: - y = y_clean - d = tf.to_float(tf.less(0.5, y)) - y_discrete = tf.stop_gradient(d) + y - tf.stop_gradient(y) - pd = common_layers.inverse_exp_decay(hparams.startup_steps * 2) - pd *= hparams.d_mix - pd = pd if hparams.mode == tf.estimator.ModeKeys.TRAIN else 1.0 - c = tf.where(tf.less(tf.random_uniform( - [common_layers.shape_list(y)[0]]), pd), y_discrete, y) - h1a = tf.layers.dense(c, filter_size, name="vch1a") - h1b = tf.layers.dense(1.0 - c, filter_size, name="vch1b") - h1 = h1a + h1b - dx = tf.to_int32(tf.stop_gradient(d)) - c = bit_to_int(dx, z_size) - if hparams.bottleneck_kind == "gumbel-softmax": - _, hot, l = dae(x, hparams, name) - c = tf.argmax(hot, axis=-1) - h1 = tf.layers.dense(hot, hparams.hidden_size, name="dae_dense") - if hparams.bottleneck_kind == "vq-vae": - x_means_hot, x_means, q_loss, e_loss = kmeans(x, means, hparams) - - # Get the discrete latent represenation - x_means_idx = tf.argmax(x_means_hot, axis=-1) - - # Get the binary representation - x_means_bits = int_to_bit( - x_means_idx, - nbits=int(hparams.z_size / hparams.num_blocks), - base=2) - shape = common_layers.shape_list(x_means_bits) - new_shape = shape[:-1] - new_shape[-1] = hparams.z_size - x_means_bits = tf.reshape(x_means_bits, shape=new_shape) - c = bit_to_int( - tf.to_int32(x_means_bits), - nbits=hparams.z_size, - base=2) - - # Update the ema variables - if hparams.ema: - tf.logging.info("Using EMA with beta = {}".format(hparams.beta)) - updated_ema_count = moving_averages.assign_moving_average( - ema_count, - tf.reduce_sum( - tf.reshape( - x_means_hot, - shape=[-1, hparams.num_blocks, hparams.block_v_size]), - axis=0), - hparams.decay, - zero_debias=False) - - x_means_hot_flat = tf.reshape( - x_means_hot, shape=[-1, hparams.num_blocks, hparams.block_v_size]) - x_reshaped = hparams.reshape_fn(x, hparams) - dw = tf.matmul( - 
tf.transpose(x_means_hot_flat, perm=[1, 2, 0]), - tf.transpose(x_reshaped, perm=[1, 0, 2])) - updated_ema_means = moving_averages.assign_moving_average( - ema_means, dw, hparams.decay, zero_debias=False) - n = tf.reduce_sum(updated_ema_count, axis=-1, keep_dims=True) - updated_ema_count = ((updated_ema_count + hparams.epsilon) / - (n + hparams.v_size * hparams.epsilon) * n) - updated_ema_means /= tf.expand_dims(updated_ema_count, axis=-1) - - with tf.control_dependencies([e_loss]): - update_means = tf.assign(means, updated_ema_means) - with tf.control_dependencies([update_means]): - l = hparams.beta * e_loss - else: - l = q_loss + hparams.beta * e_loss - - x_reshaped = hparams.reshape_fn(x, hparams) - shape = common_layers.shape_list(x) - x_means = tf.reshape(x_means, shape) - x_reshaped = tf.reshape(x_reshaped, shape) - h1 = x_reshaped + tf.stop_gradient(x_means - x_reshaped) - - if hparams.bottleneck_kind == "rounding": - h = tf.layers.dense(x, 1, name="vcc") - - # Make h between 0 and 1 - h = tf.sigmoid(h) - - # Multiply by z_size to get it between [0, z_size] - h *= hparams.v_size - - # Use the rounding bottleneck - h1 = h + tf.stop_gradient(tf.round(h) - h) - c = tf.squeeze(tf.round(h), axis=-1) - c = tf.to_int32(c) - h2 = tf.layers.dense(tf.nn.relu(h1), filter_size, name="vch2") - res = tf.layers.dense(tf.nn.relu(h2), hparams.hidden_size, name="vcfin") - return res, c, l, embed - - def compress(x, c, is_2d, hparams, name): """Compress.""" with tf.variable_scope(name): @@ -627,10 +302,7 @@ def ae_transformer_internal(inputs, target_space, hparams, cache=None, - predict_mask=1.0, - means=None, - ema_count=None, - ema_means=None): + predict_mask=1.0): """AE Transformer, main step used for training.""" # Summaries break with the do_refine cond, turn them off in that case. global _DO_SUMMARIES @@ -657,7 +329,7 @@ def ae_transformer_internal(inputs, # flatten here original_targets_shape = tf.shape(targets) if hparams.task == "image": - cia.maybe_reshape_4d_to_3d(targets, hparams) + cia.maybe_reshape_4d_to_3d(targets) if hparams.task == "translate": max_targets_len_from_inputs = tf.concat([inputs, inputs], axis=1) else: @@ -666,12 +338,17 @@ def ae_transformer_internal(inputs, targets, _ = common_layers.pad_to_same_length( targets, max_targets_len_from_inputs, final_length_divisible_by=2**hparams.num_compress_steps) - targets_c = compress(targets, inputs, False, hparams, "compress") + if hparams.ae_input: + targets_c = compress(targets, inputs, False, hparams, "compress") + else: + targets_c = compress(targets, None, False, hparams, "compress") if hparams.mode != tf.estimator.ModeKeys.PREDICT: # Compress and bottleneck. 
- latents_dense, latents_discrete, extra_loss, embed = bottleneck( - targets_c, hparams, - hparams.compress_filter_size, "vc", means, ema_count, ema_means) + latents_dense, latents_discrete, extra_loss, embed = hparams.bottleneck( + x=targets_c, + filter_size=hparams.compress_filter_size, + name="vc", + mode=hparams.mode) if _DO_SUMMARIES: tf.summary.histogram("b0", tf.reshape(latents_discrete[:, 0, :], [-1])) pc = common_layers.inverse_exp_decay(hparams.startup_steps) @@ -696,9 +373,11 @@ def ae_transformer_internal(inputs, losses["latent_pred"] = tf.reduce_mean((inputs_c - targets_c)**2) * 20 def bn_inputs(): with tf.variable_scope(tf.get_variable_scope(), reuse=True): - bn, _, _, _ = bottleneck(inputs_c, hparams, - hparams.compress_filter_size, "vc", means, - ema_count, ema_means) + bn, _, _, _ = hparams.bottleneck( + x=inputs_c, + filter_size=hparams.compress_filter_size, + name="vc", + mode=hparams.mode) return bn pbn = 0.8 if hparams.mode == tf.estimator.ModeKeys.TRAIN else 1.0 inputs_c = tf.cond(tf.less(tf.random_uniform([]), pbn), @@ -710,14 +389,15 @@ def bn_inputs(): else: if hparams.bottleneck_kind in ["dense", "vae"]: inputs_c = decode_transformer(inputs, ed, targets_c, hparams, "dec_c") - latents_dense, _, _, _ = bottleneck( - inputs_c, hparams, hparams.compress_filter_size, "vc", - means, ema_count, ema_means) + latents_dense, _, _, _ = hparams.bottleneck( + x=inputs_c, + filter_size=hparams.compress_filter_size, + name="vc", + mode=hparams.mode) else: latent_len = common_layers.shape_list(targets_c)[1] - _, _, _, embed = bottleneck(targets_c, hparams, - hparams.compress_filter_size, "vc", means, - ema_count, ema_means) + _, _, _, embed = hparams.bottleneck( + x=targets_c, filter_size=hparams.compress_filter_size, name="vc") latents_dense = tf.zeros_like(targets_c[:, :latent_len, :, :]) if cache is None: cache = ae_latent_sample( @@ -785,56 +465,76 @@ def __init__(self, *args, **kwargs): super(TransformerAE, self).__init__(*args, **kwargs) self.predict_mask = 1.0 - # Define the embeddings if we are using vq-vae - self.means = None - self.ema_count = None - self.ema_means = None - if self._hparams.bottleneck_kind == "vq-vae": - # Check that num_blocks exactly divides hidden_size and v_size - assert self._hparams.hidden_size % self._hparams.num_blocks == 0 - assert self._hparams.v_size % self._hparams.num_blocks == 0 - - self._hparams.block_dim = int( - self._hparams.hidden_size // self._hparams.num_blocks) - self._hparams.block_v_size = 2**( - self._hparams.z_size / self._hparams.num_blocks) - self._hparams.block_v_size = int(self._hparams.block_v_size) + # Define bottleneck function + self._hparams.bottleneck = partial( + discretization.discrete_bottleneck, + hidden_size=self._hparams.hidden_size, + z_size=self._hparams.z_size, + filter_size=self._hparams.filter_size, + startup_steps=self.hparams.startup_steps, + bottleneck_kind=self._hparams.bottleneck_kind, + num_blocks=self._hparams.num_blocks, + reshape_method=self._hparams.reshape_method, + beta=self._hparams.beta, + noise_dev=self._hparams.noise_dev, + decay=self._hparams.decay, + discrete_mix=self._hparams.d_mix, + random_top_k=self._hparams.random_top_k, + epsilon=self._hparams.epsilon, + softmax_k=self._hparams.softmax_k, + kl_warmup_steps=self._hparams.kl_warmup_steps, + ema=self._hparams.ema, + summary=_DO_SUMMARIES, + dp_strength=self._hparams.dp_strength, + dp_decay=self._hparams.dp_decay, + dp_alpha=self._hparams.dp_alpha) + + # Set the discretization bottleneck specific things here + if 
self._hparams.bottleneck_kind == "dvq": + block_dim = int(self._hparams.hidden_size // self._hparams.num_blocks) + block_v_size = 2**(self._hparams.z_size / self._hparams.num_blocks) + block_v_size = int(block_v_size) if self._hparams.reshape_method == "project": - tf.logging.info("Using projections for decomposed vq-vae") + tf.logging.info("Using projections for DVQ") tf.logging.info("Trainable projections = {}".format( self._hparams.trainable_projections)) - self._hparams.projection_tensors = tf.get_variable( + + projection_tensors = tf.get_variable( name="projection", shape=[ - self._hparams.num_blocks, self._hparams.hidden_size, - self._hparams.block_dim + self._hparams.num_blocks, self._hparams.hidden_size, block_dim ], initializer=tf.contrib.layers.xavier_initializer(), trainable=self._hparams.trainable_projections) - self._hparams.reshape_fn = project_hidden + + self._hparams.bottleneck = partial( + self._hparams.bottleneck, projection_tensors=projection_tensors) elif self._hparams.reshape_method == "slice": - tf.logging.info("Using slices for decomposed vq-vae") - self._hparams.reshape_fn = slice_hidden + tf.logging.info("Using slices for DVQ") else: raise ValueError("Unknown reshape method") - self.means = tf.get_variable( + means = tf.get_variable( name="means", - shape=[ - self._hparams.num_blocks, self._hparams.block_v_size, - self._hparams.block_dim - ], + shape=[self._hparams.num_blocks, block_v_size, block_dim], initializer=tf.uniform_unit_scaling_initializer()) # Create the shadow variables if we are using EMA if self._hparams.ema: - self.ema_count = tf.get_variable( - "ema_count", [self._hparams.num_blocks, self._hparams.block_v_size], + ema_count = tf.get_variable( + "ema_count", [self._hparams.num_blocks, block_v_size], initializer=tf.constant_initializer(0)) - with tf.colocate_with(self.means): - self.ema_means = tf.get_variable( - "ema_means", initializer=self.means.initialized_value()) + with tf.colocate_with(means): + ema_means = tf.get_variable( + "ema_means", initializer=means.initialized_value()) + + # Update bottleneck + self._hparams.bottleneck = partial( + self._hparams.bottleneck, + means=means, + ema_count=ema_count, + ema_means=ema_means) @property def has_input(self): @@ -852,10 +552,7 @@ def body(self, features): features["target_space_id"], self._hparams, features.get("cache_raw", None), - predict_mask=self.predict_mask, - means=self.means, - ema_count=self.ema_count, - ema_means=self.ema_means) + predict_mask=self.predict_mask) return res, loss def prepare_features_for_infer(self, features): @@ -870,8 +567,7 @@ def prepare_features_for_infer(self, features): targets = tf.zeros([beam_batch_size, 1, 1, self._hparams.hidden_size]) with tf.variable_scope("body"): _, _, cache = ae_transformer_internal( - inputs, targets, features["target_space_id"], self._hparams, - self.means, self.ema_count, self.ema_means) + inputs, targets, features["target_space_id"], self._hparams) features["cache_raw"] = cache def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, @@ -935,13 +631,19 @@ def transformer_ae_small(): hparams.add_hparam("z_size", 14) hparams.add_hparam("noise_dev", 0.5) hparams.add_hparam("d_mix", 0.5) - # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae. + # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, dvq. 
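The `functools.partial` wiring above freezes every hparams-dependent argument of `discretization.discrete_bottleneck` once in `__init__`, so call sites only supply `x`, `filter_size`, `name`, and `mode`. A minimal sketch of the pattern with a stand-in function; the names here are illustrative, not from the patch:

```python
from functools import partial


def bottleneck_fn(x, filter_size, name, mode=None,
                  hidden_size=None, z_size=None, bottleneck_kind="dvq"):
  # Stand-in for discretization.discrete_bottleneck.
  return x, None, 0.0, lambda code: code


# Freeze the hparams-dependent arguments once ...
bottleneck = partial(bottleneck_fn, hidden_size=512, z_size=14,
                     bottleneck_kind="dvq")
# ... so later calls look like the ones in ae_transformer_internal.
res, c, loss, embed = bottleneck(x="targets_c", filter_size=2048,
                                 name="vc", mode="train")
```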
hparams.add_hparam("bottleneck_kind", "semhash") hparams.add_hparam("num_blocks", 1) hparams.add_hparam("num_decode_blocks", 1) - # Reshape method for decomposed vq-vae: slice, project + # Reshape method for DVQ: slice, project hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) + # Add option to pass the input to the autoencoder + hparams.add_hparam("ae_input", False) + # Hparams for Dirichlet process process + hparams.add_hparam("dp_alpha", 0.5) + hparams.add_hparam("dp_strength", 0.25) + hparams.add_hparam("dp_decay", 1.0) hparams.add_hparam("unmasked_percentage", 0.1) hparams.add_hparam("do_ae", True) hparams.add_hparam("do_mask", True) @@ -1086,3 +788,12 @@ def transformer_ae_a8(): hparams.optimizer = "Adafactor" hparams.noise_dev = 0.5 return hparams + + +@registry.register_hparams +def transformer_ae_base_tpu(): + """Base config adjusted for TPU.""" + hparams = transformer_ae_base() + transformer.update_hparams_for_tpu(hparams) + hparams.batch_size = 512 + return hparams diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 11d446f5b..09b252291 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -882,7 +882,8 @@ def transformer_base_v1(): hparams.max_length = 256 hparams.clip_grad_norm = 0. # i.e. no gradient clipping hparams.optimizer_adam_epsilon = 1e-9 - hparams.learning_rate_schedule = "linear_warmup_rsqrt_decay" + hparams.learning_rate_schedule = "legacy" + hparams.learning_rate_decay_scheme = "noam" hparams.learning_rate = 0.1 hparams.learning_rate_warmup_steps = 4000 hparams.initializer_gain = 1.0 @@ -943,6 +944,11 @@ def transformer_base(): # transformer_base_v2. hparams = transformer_base_v2() hparams.optimizer_adam_beta2 = 0.997 + # New way of specifying learning rate schedule. + # Equivalent to previous version. + hparams.learning_rate_schedule = ( + "constant*linear_warmup*rsqrt_decay*rsqrt_hidden_size") + hparams.learning_rate_constant = 2.0 return hparams @@ -1279,7 +1285,10 @@ def update_hparams_for_tpu(hparams): """Change hparams to be compatible with TPU training.""" # Adafactor uses less memory than Adam. + # switch to Adafactor with its recommended learning rate scheme. hparams.optimizer = "Adafactor" + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 # Avoid an expensive concat on TPU. 
# >1 shards helps with faster parameter distribution on multi-GPU machines diff --git a/tensor2tensor/test_data/example_usr_dir/my_submodule.py b/tensor2tensor/test_data/example_usr_dir/my_submodule.py index 70929afbc..e3ffd962c 100644 --- a/tensor2tensor/test_data/example_usr_dir/my_submodule.py +++ b/tensor2tensor/test_data/example_usr_dir/my_submodule.py @@ -56,10 +56,10 @@ def dataset_splits(self): # 10% evaluation data return [{ "split": problem.DatasetSplit.TRAIN, - "shards": 90, + "shards": 9, }, { "split": problem.DatasetSplit.EVAL, - "shards": 10, + "shards": 1, }] def generate_samples(self, data_dir, tmp_dir, dataset_split): diff --git a/tensor2tensor/test_data/example_usr_dir/requirements.txt b/tensor2tensor/test_data/example_usr_dir/requirements.txt new file mode 100644 index 000000000..3678319be --- /dev/null +++ b/tensor2tensor/test_data/example_usr_dir/requirements.txt @@ -0,0 +1 @@ +gutenberg diff --git a/tensor2tensor/test_data/example_usr_dir/setup.py b/tensor2tensor/test_data/example_usr_dir/setup.py deleted file mode 100644 index ad3701bb2..000000000 --- a/tensor2tensor/test_data/example_usr_dir/setup.py +++ /dev/null @@ -1,34 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Example setup.py for a t2t_usr_dir launching on Cloud ML Engine. - -This is only necessary if you have additional required pip packages for the -import of your usr_dir, and only if you're launching t2t-trainer on Cloud ML -Engine with the --cloud_mlengine flag. - -Note that the call to setup uses find_packages() and that the location of this -file is alongside the __init__.py file that imports my_submodule. -""" -from setuptools import find_packages -from setuptools import setup -setup( - name='DummyUsrDirPackage', - version='0.1', - packages=find_packages(), - install_requires=[ - 'gutenberg', - ], -) diff --git a/tensor2tensor/utils/adafactor.py b/tensor2tensor/utils/adafactor.py new file mode 100644 index 000000000..de14aff52 --- /dev/null +++ b/tensor2tensor/utils/adafactor.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Optimization.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import tensorflow as tf + + +class AdafactorOptimizer(tf.train.Optimizer): + """Optimizer that implements the Adafactor algorithm. 
+
+  Adafactor is described in TODO(noam): post paper to arxiv.
+
+  Adafactor is most similar to Adam (Kingma and Ba); the major differences are:
+
+  1. For a two-dimensional AxB weight matrix, Adafactor uses only A+B auxiliary
+     parameters to maintain the second-moment estimator, instead of AB.
+     This is advantageous on memory-limited systems. In addition, beta1
+     (momentum) is set to zero by default, saving an additional auxiliary
+     parameter per weight.
+
+  2. Adafactor incorporates "update-clipping" - a scale-invariant analog of
+     gradient clipping. This adds stability.
+
+  3. Adafactor does not require an external "learning rate". By default, it
+     incorporates a relative-update-scale schedule, corresponding to
+     inverse-square-root learning-rate-decay in ADAM. We hope this works well
+     for most applications.
+
+  ALGORITHM:
+
+  parameter -= absolute_update_scale * clip(grad / grad_scale)
+
+  where:
+
+    absolute_update_scale := relative_update_scale * parameter_scale
+    relative_update_scale := min((step_num + 1)**-0.5, 1e-2)
+    parameter_scale := max(rms(var), 1e-3)
+    clip(x) := x / max(1.0, rms(x))
+    grad_scale := tf.sqrt(v)   (v is the second-moment estimator)
+
+  The second-moment estimator v is maintained in a manner similar to Adam:
+  We initialize
+  ```
+  if var is 2-dimensional:
+    v_r <- zeros([num_rows])
+    v_c <- zeros([num_cols])
+  else:
+    v <- zeros(shape(var))
+  ```
+
+  The update rule is as follows:
+  ```
+  decay_rate = 1 - (step_num + 1) ^ -0.8
+  grad_squared = tf.square(grad) + epsilon
+  if var is 2-dimensional:
+    v_r <- decay_rate * v_r + (1 - decay_rate) * reduce_mean(grad_squared, 1)
+    v_c <- decay_rate * v_c + (1 - decay_rate) * reduce_mean(grad_squared, 0)
+    v = outer_prod(v_r, v_c) / reduce_mean(v_r)
+  else:
+    v <- decay_rate * v + (1 - decay_rate) * grad_squared
+  ```
+
+
+  Several parts of this algorithm are configurable from the initializer.
+
+    multiply_by_parameter_scale: If True, then compute absolute_update_scale
+      as described above. If False, let absolute_update_scale be the externally
+      supplied learning_rate.
+    learning_rate: represents relative_update_scale if
+      multiply_by_parameter_scale==True, or absolute_update_scale if
+      multiply_by_parameter_scale==False.
+    decay_rate: Decay rate of the second moment estimator (varies by step_num).
+      This should be set to a function such that:
+      1-1/(step_num + 1) <= decay_rate(step_num) < 1.0
+    beta1: enables momentum, as in Adam. Uses extra memory if nonzero.
+    clipping_threshold: should be >=1.0 or None for no update clipping
+    factored: whether to factor the second-moment estimator. True means
+      less memory usage.
+
+  TODO(noam): we should also apply the 2d logic to the two final dimensions
+  of >2d convolutional kernels.
+  """
+
+  def __init__(self,
+               multiply_by_parameter_scale=True,
+               learning_rate=None,
+               decay_rate=None,
+               beta1=0.0,
+               clipping_threshold=1.0,
+               factored=True,
+               use_locking=False,
+               name="Adafactor"):
+    """Construct a new Adafactor optimizer.
+
+    See class comment.
+
+    Args:
+      multiply_by_parameter_scale: a boolean
+      learning_rate: an optional Scalar.
+      decay_rate: an optional Scalar.
+      beta1: a float value between 0 and 1
+      clipping_threshold: an optional float >= 1
+      factored: a boolean - whether to use factored second-moment estimator
+        for 2d variables
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AdafactorOptimizer".
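To make the factored second-moment estimator concrete, here is a small NumPy sketch of one step of the update rule from the class docstring for a 2-D weight. It is illustrative only; the function and variable names are invented for this example and are not part of the optimizer class.

```python
import numpy as np


def factored_second_moment_step(v_r, v_c, grad, step_num, epsilon=1e-30):
  """One update of the row/column statistics, following the docstring."""
  decay_rate = 1.0 - (step_num + 1.0) ** -0.8
  grad_squared = np.square(grad) + epsilon
  v_r = decay_rate * v_r + (1 - decay_rate) * grad_squared.mean(axis=1)
  v_c = decay_rate * v_c + (1 - decay_rate) * grad_squared.mean(axis=0)
  # Reconstruct the full AxB estimator from only A+B stored statistics.
  v = np.outer(v_r, v_c) / v_r.mean()
  return v_r, v_c, v


num_rows, num_cols = 4, 3
v_r, v_c = np.zeros(num_rows), np.zeros(num_cols)
grad = np.random.randn(num_rows, num_cols)
v_r, v_c, v = factored_second_moment_step(v_r, v_c, grad, step_num=0)
```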
+
+    Raises:
+      ValueError: if absolute_update_scale and relative_update_scale_fn are both
+        present or both absent.
+    """
+    super(AdafactorOptimizer, self).__init__(use_locking, name)
+    self._multiply_by_parameter_scale = multiply_by_parameter_scale
+    if learning_rate is None:
+      learning_rate = self._learning_rate_default(multiply_by_parameter_scale)
+    self._learning_rate = learning_rate
+    if decay_rate is None:
+      decay_rate = self._decay_rate_default()
+    self._decay_rate = decay_rate
+    self._beta1 = beta1
+    self._clipping_threshold = clipping_threshold
+    self._factored = factored
+
+  def _should_use_factored_second_moment_estimate(self, shape):
+    """Should we use a factored second moment estimator.
+
+    Based on the shape of the variable.
+
+    Args:
+      shape: a list of integers
+    Returns:
+      a boolean
+    """
+    return self._factored and len(shape) == 2
+
+  def _create_slots(self, var_list):
+    for var in var_list:
+      shape = var.get_shape().as_list()
+      if self._beta1:
+        self._zeros_slot(var, "m", self._name)
+      if self._should_use_factored_second_moment_estimate(shape):
+        r_val = tf.zeros([shape[0]], dtype=tf.float32)
+        c_val = tf.zeros([shape[1]], dtype=tf.float32)
+        self._get_or_make_slot(var, r_val, "vr", self._name)
+        self._get_or_make_slot(var, c_val, "vc", self._name)
+      else:
+        self._zeros_slot(var, "v", self._name)
+
+  def _apply_dense(self, grad, var):
+    return self._resource_apply_dense(grad, var)
+
+  def _parameter_scale(self, var):
+    """Estimate the scale of the parameters from the current values.
+
+    We include a minimum value of 0.001 to give it a chance to escape 0
+    if it was zero-initialized.
+
+    Instead of using the value, we could impute the scale from the shape,
+    as initializers do.
+
+    Args:
+      var: a variable or Tensor.
+    Returns:
+      a Scalar
+    """
+    return tf.maximum(reduce_rms(var), 0.001)
+
+  def _resource_apply_dense(self, grad, var):
+    grad_squared = tf.square(grad) + 1e-30
+    grad_squared_mean = tf.reduce_mean(grad_squared)
+    decay_rate = self._decay_rate
+    update_scale = self._learning_rate
+    if self._multiply_by_parameter_scale:
+      update_scale *= self._parameter_scale(var)
+    # HACK: Make things dependent on grad.
+    # This confounds the XLA rewriter and keeps it from fusing computations
+    # across different variables. This fusion is bad for HBM usage, since
+    # it causes the gradients to persist in memory.
+ decay_rate += grad_squared_mean * 1e-30 + update_scale += grad_squared_mean * 1e-30 + # END HACK + mixing_rate = 1.0 - decay_rate + shape = var.get_shape().as_list() + updates = [] + if self._should_use_factored_second_moment_estimate(shape): + grad_squared_row_mean = tf.reduce_mean(grad_squared, 1) + grad_squared_col_mean = tf.reduce_mean(grad_squared, 0) + vr = self.get_slot(var, "vr") + new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean) + vc = self.get_slot(var, "vc") + new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean) + vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking) + vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking) + updates = [vr_update, vc_update] + long_term_mean = tf.reduce_mean(new_vr) + r_factor = tf.rsqrt(new_vr / long_term_mean) + c_factor = tf.rsqrt(new_vc) + x = grad * tf.expand_dims(r_factor, 1) * tf.expand_dims(c_factor, 0) + else: + v = self.get_slot(var, "v") + new_v = decay_rate * v + mixing_rate * grad_squared + v_update = tf.assign(v, new_v, use_locking=self._use_locking) + updates = [v_update] + x = grad * tf.rsqrt(new_v) + if self._clipping_threshold is not None: + clipping_denom = tf.maximum(1.0, reduce_rms(x) / self._clipping_threshold) + x /= clipping_denom + subtrahend = update_scale * x + if self._beta1: + m = self.get_slot(var, "m") + new_m = self._beta1 * m + (1.0 - self._beta1) * subtrahend + updates.append(tf.assign(m, new_m, use_locking=self._use_locking)) + subtrahend = new_m + var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking) + updates = [var_update] + updates + return tf.group(*updates) + + def _decay_rate_default(self): + return adafactor_decay_rate_pow(0.8) + + def _learning_rate_default(self, multiply_by_parameter_scale): + learning_rate = tf.minimum(tf.rsqrt(step_num() + 1.0), 0.01) + if not multiply_by_parameter_scale: + learning_rate *= 0.05 + return learning_rate + + +def adafactor_decay_rate_adam(beta2): + """Second-moment decay rate like Adam, subsuming the correction factor. + + Args: + beta2: a float between 0 and 1 + Returns: + a scalar + """ + t = tf.to_float(tf.train.get_or_create_global_step()) + 1.0 + decay = beta2 * (1.0 - tf.pow(beta2, t - 1.0)) / (1.0 - tf.pow(beta2, t)) + # decay = tf.cond(tf.equal(t, 1.0), lambda: beta2, lambda: decay) + return decay + + +def adafactor_decay_rate_pow(exponent): + """Second moment decay rate where memory-length grows as step_num^exponent. + + Args: + exponent: a float between 0 and 1 + Returns: + a scalar + """ + return 1.0 - tf.pow((step_num() + 1.0), -exponent) + + +def step_num(): + return tf.to_float(tf.train.get_or_create_global_step()) + + +def adafactor_optimizer_from_hparams(hparams, lr): + """Create an Adafactor optimizer based on model hparams. + + Args: + hparams: model hyperparameters + lr: learning rate scalar. 
+ Returns: + an AdafactorOptimizer + Raises: + ValueError: on illegal values + """ + if hparams.optimizer_adafactor_decay_type == "Adam": + decay_rate = adafactor_decay_rate_adam( + hparams.optimizer_adafactor_beta2) + elif hparams.optimizer_adafactor_decay_type == "pow": + decay_rate = adafactor_decay_rate_pow( + hparams.optimizer_adafactor_memory_exponent) + else: + raise ValueError("unknown optimizer_adafactor_decay_type") + return AdafactorOptimizer( + multiply_by_parameter_scale=( + hparams.optimizer_adafactor_multiply_by_parameter_scale), + learning_rate=lr, + decay_rate=decay_rate, + beta1=hparams.optimizer_adafactor_beta1, + clipping_threshold=hparams.optimizer_adafactor_clipping_threshold, + factored=hparams.optimizer_adafactor_factored, + use_locking=False, + name="Adafactor") + + +def reduce_rms(x): + return tf.sqrt(tf.reduce_mean(tf.square(x))) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py old mode 100644 new mode 100755 index 1d9e1c591..9d0cc0f4a --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -36,15 +36,20 @@ # TODO(rsepassi): # * Enable multi-machine sync/async training -SETUP_PY = """ + +def get_setup_file(name, packages=None): + if not packages: + packages = [] + return """ from setuptools import find_packages from setuptools import setup setup( - name='DummyUsrDirPackage', + name='{name}', version='0.1', packages=find_packages(), + install_requires={pypi_packages} ) -""" +""".format(name=name, pypi_packages=str(list(packages))) def job_dir(): @@ -52,6 +57,15 @@ def job_dir(): return getattr(FLAGS, 'job-dir', '') or getattr(FLAGS, 'job_dir', '') +def get_requirements(usr_dir): + requirements_file = os.path.join(usr_dir, 'requirements.txt') + if not tf.gfile.Exists(requirements_file): + return [] + with tf.gfile.Open(requirements_file) as f: + pkg_list = f.readlines() + return [pkg.strip() for pkg in pkg_list if 'tensor2tensor' not in pkg] + + def flags_as_args(): """Convert FLAGS to list of args suitable for passing on cmd line.""" if hasattr(FLAGS, 'flag_values_dict'): @@ -77,27 +91,32 @@ def flags_as_args(): def machine_config(num_gpus=1, use_tpu=False, master_type=None): """Return dict specifying machine config for trainingInput.""" - scale_tier = 'BASIC_GPU' if use_tpu: - scale_tier = 'BASIC_TPU' + master_type = 'standard_tpu' elif num_gpus <= 0: - scale_tier = 'BASIC' - elif num_gpus > 1: - scale_tier = 'CUSTOM' - - config = {'scaleTier': scale_tier} - - if scale_tier == 'CUSTOM': - assert num_gpus > 1 - if num_gpus not in [4, 8]: + master_type = master_type or 'standard' + cpu_types = ['standard', 'large_model', 'complex_model_s', + 'complex_model_m', 'complex_model_l'] + if master_type not in cpu_types: + raise ValueError('Expected `cloudml_engine_master_type` to be one of %s ' + 'when `worker_gpu` <= 0, found %s.', str(cpu_types), + master_type) + elif num_gpus >= 1: + if num_gpus == 1: + if master_type != 'standard_gpu': + master_type = 'standard_p100' + elif num_gpus == 4: + if master_type != 'complex_model_m_gpu': + master_type = 'complex_model_m_p100' + elif num_gpus == 8: + master_type = 'complex_model_l_gpu' + else: raise ValueError('Must use exactly 1, 4, or 8 GPUs.') - config['masterType'] = ('complex_model_m_gpu' - if num_gpus == 4 else 'complex_model_l_gpu') - - if master_type: - config['masterType'] = master_type - - return config + assert master_type + return { + 'scaleTier': 'CUSTOM', + 'masterType': master_type + } def configure_job(): @@ -131,9 +150,6 @@ def 
configure_job(): FLAGS.autotune_parallel_trials, ) - if training_input['scaleTier'] == 'CUSTOM': - assert 'masterType' in training_input - timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') job_name = '%s_%s_t2t_%s' % (FLAGS.model, FLAGS.problems, timestamp) job_spec = {'jobId': job_name, 'trainingInput': training_input} @@ -173,7 +189,38 @@ def _tar_and_copy(src_dir, target_dir): def tar_and_copy_t2t(train_dir): """Tar Tensor2Tensor and cp to train_dir.""" tf.logging.info('Tarring and pushing local Tensor2Tensor package.') - t2t_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + + output = cloud.shell_output('pip show tensor2tensor').split('\n') + assert output[1].startswith('Version') + assert output[7].startswith('Location') + t2t_version = output[1].split(':')[1].strip() + t2t_dir = output[7].split(':')[1].strip() + + # A local installation cloned from GitHub will have a setup.py file and a docs + # folder + is_local_t2t = all([ + tf.gfile.Exists(os.path.join(t2t_dir, fname)) + for fname in ['setup.py', 'docs/cloud_mlengine.md'] + ]) + + if is_local_t2t: + tf.logging.info('Found local T2T installation. Tarring directory %s', + t2t_dir) + else: + # PyPI installation + # Create a folder with just a setup.py file pointing to the right version + tf.logging.info('Found PyPI T2T installation. Launching tensor2tensor==%s', + t2t_version) + t2t_dir = os.path.join(tempfile.gettempdir(), 'tensor2tensor_tmp') + shutil.rmtree(t2t_dir, ignore_errors=True) + os.mkdir(t2t_dir) + setup_fname = os.path.join(t2t_dir, 'setup.py') + setup_file_str = get_setup_file( + name='DummyT2TPackage', + packages=['tensor2tensor==%s' % t2t_version] + ) + with tf.gfile.Open(setup_fname, 'w') as f: + f.write(setup_file_str) t2t_tar = _tar_and_copy(t2t_dir, train_dir) return t2t_tar @@ -189,13 +236,12 @@ def tar_and_copy_usr_dir(usr_dir, train_dir): shutil.copytree(usr_dir, tmp_usr_dir) # Insert setup.py if one does not exist top_setup_fname = os.path.join(top_dir, 'setup.py') - usr_setup_fname = os.path.join(tmp_usr_dir, 'setup.py') - if tf.gfile.Exists(usr_setup_fname): - tf.gfile.Copy(usr_setup_fname, top_setup_fname) - tf.gfile.Remove(usr_setup_fname) - else: - with tf.gfile.Open(top_setup_fname, 'w') as f: - f.write(SETUP_PY) + setup_file_str = get_setup_file( + name='DummyUsrDirPackage', + packages=get_requirements(usr_dir) + ) + with tf.gfile.Open(top_setup_fname, 'w') as f: + f.write(setup_file_str) usr_tar = _tar_and_copy(top_dir, train_dir) return usr_tar diff --git a/tensor2tensor/utils/learning_rate.py b/tensor2tensor/utils/learning_rate.py new file mode 100644 index 000000000..169b59348 --- /dev/null +++ b/tensor2tensor/utils/learning_rate.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
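The cloud_mlengine.py changes above replace the static setup.py template with helpers that read a `requirements.txt` from the usr_dir and generate a setup.py string on the fly. The following standalone approximation shows the idea; the real helpers go through `tf.gfile`, and the function names and path handling here are invented for illustration.

```python
def make_setup_py(name, packages):
  # Assembles a setup.py string with the given name and pip dependencies,
  # in the same spirit as get_setup_file above.
  return """
from setuptools import find_packages
from setuptools import setup
setup(
    name='{name}',
    version='0.1',
    packages=find_packages(),
    install_requires={pypi_packages}
)
""".format(name=name, pypi_packages=str(list(packages)))


def read_requirements(path):
  # Reads one package per line; tensor2tensor itself is shipped separately,
  # so it is filtered out, as in get_requirements above.
  with open(path) as f:
    pkgs = [line.strip() for line in f]
  return [p for p in pkgs if p and 'tensor2tensor' not in p]


print(make_setup_py('DummyUsrDirPackage', ['gutenberg']))
```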
+ +"""Optimization.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +import tensorflow as tf + + +def learning_rate_factor(name, step_num, hparams): + if name == "constant": + return hparams.learning_rate_constant + elif name == "linear_warmup": + return tf.minimum(1.0, step_num / hparams.learning_rate_warmup_steps) + elif name == "rsqrt_decay": + return tf.rsqrt(tf.maximum(step_num, hparams.learning_rate_warmup_steps)) + elif name == "rsqrt_hidden_size": + return hparams.hidden_size ** -0.5 + elif name == "legacy": + return legacy_learning_rate_schedule(hparams) + else: + raise ValueError("unknown learning rate factor %s" % name) + + +def learning_rate_schedule(hparams): + """Learning rate schedule based on hparams.""" + step_num = tf.to_float(tf.train.get_or_create_global_step()) + schedule_string = hparams.learning_rate_schedule + names = schedule_string.split("*") + names = [name.strip() for name in names if name.strip()] + ret = 1.0 + for name in names: + ret *= learning_rate_factor(name, step_num, hparams) + return ret + + +def legacy_learning_rate_schedule(hparams): + """Backwards-compatible learning-rate schedule.""" + step_num = tf.to_float(tf.train.get_or_create_global_step()) + warmup_steps = tf.to_float(hparams.learning_rate_warmup_steps) + if hparams.learning_rate_decay_scheme == "noam": + ret = 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( + (step_num + 1) * warmup_steps**-1.5, (step_num + 1)**-0.5) + else: + warmup_steps = hparams.learning_rate_warmup_steps + warmup = _learning_rate_warmup(warmup_steps) + decay = _learning_rate_decay(hparams, warmup_steps) + ret = tf.where(step_num < warmup_steps, warmup, decay) + optimizer_correction = 0.002 if "Adam" in hparams.optimizer else 1.0 + return ret * optimizer_correction * hparams.learning_rate + + +def _legacy_sqrt_decay(step): + """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" + return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) + + +def _piecewise_learning_rate(step, boundaries, values): + """Scale learning rate according to the given schedule. + + Multipliers are not cumulative. + + Args: + step: global step + boundaries: List of steps to transition on. + values: Multiplier to apply at each boundary transition. + + Returns: + Scaled value for the learning rate. + """ + values = [1.0] + values + return tf.train.piecewise_constant( + step, boundaries, values, name="piecewise_lr") + + +def _learning_rate_decay(hparams, warmup_steps=0): + """Learning rate decay multiplier.""" + scheme = hparams.learning_rate_decay_scheme + warmup_steps = tf.to_float(warmup_steps) + global_step = tf.to_float(tf.train.get_or_create_global_step()) + + if not scheme or scheme == "none": + return tf.constant(1.) 
+ + tf.logging.info("Applying learning rate decay: %s.", scheme) + + if scheme == "exp": + decay_steps = hparams.learning_rate_decay_steps + p = (global_step - warmup_steps) / decay_steps + if hparams.learning_rate_decay_staircase: + p = tf.floor(p) + return tf.pow(hparams.learning_rate_decay_rate, p) + + if scheme == "piecewise": + return _piecewise_learning_rate(global_step, + hparams.learning_rate_boundaries, + hparams.learning_rate_multiples) + + if scheme == "cosine": + cycle_steps = hparams.learning_rate_cosine_cycle_steps + cycle_position = global_step % (2 * cycle_steps) + cycle_position = cycle_steps - tf.abs(cycle_steps - cycle_position) + return 0.5 * (1 + tf.cos(np.pi * cycle_position / cycle_steps)) + + if scheme == "cyclelinear10x": + # Cycle the rate linearly by 10x every warmup_steps, up and down. + cycle_steps = warmup_steps + cycle_position = global_step % (2 * cycle_steps) + cycle_position = tf.to_float( # Normalize to the interval [-1, 1]. + cycle_position - cycle_steps) / float(cycle_steps) + cycle_position = 1.0 - tf.abs(cycle_position) # 0 to 1 and back to 0. + return (cycle_position + 0.1) * 3.0 # 10x difference each cycle (0.3-3). + + if scheme == "sqrt": + return _legacy_sqrt_decay(global_step - warmup_steps) + + raise ValueError("Unrecognized learning rate decay scheme: %s" % + hparams.learning_rate_decay_scheme) + + +def _learning_rate_warmup(warmup_steps, warmup_schedule="exp"): + """Learning rate warmup multiplier.""" + if not warmup_steps: + return tf.constant(1.) + + tf.logging.info("Applying %s learning rate warmup for %d steps", + warmup_schedule, warmup_steps) + + warmup_steps = tf.to_float(warmup_steps) + global_step = tf.to_float(tf.train.get_or_create_global_step()) + + if warmup_schedule == "exp": + return tf.exp(tf.log(0.01) / warmup_steps)**(warmup_steps - global_step) + else: + assert warmup_schedule == "linear" + start = tf.constant(0.35) + return ((tf.constant(1.) - start) / warmup_steps) * global_step + start diff --git a/tensor2tensor/utils/optimize.py b/tensor2tensor/utils/optimize.py index 6b303d7d0..3d5526535 100644 --- a/tensor2tensor/utils/optimize.py +++ b/tensor2tensor/utils/optimize.py @@ -22,6 +22,7 @@ import numpy as np +from tensor2tensor.utils import adafactor from tensor2tensor.utils import yellowfin import tensorflow as tf @@ -82,7 +83,7 @@ def __init__(self, optimizer_name, lr, hparams, use_tpu=False): # We change the default epsilon for Adam and re-scale lr. # Using LazyAdam as it's much faster for large vocabulary embeddings. 
self._opt = tf.contrib.opt.LazyAdamOptimizer( - lr / 500.0, + lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) @@ -96,12 +97,12 @@ def __init__(self, optimizer_name, lr, hparams, use_tpu=False): learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) elif optimizer_name == "TrueAdam": self._opt = tf.train.AdamOptimizer( - lr / 500.0, + lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "Adafactor": - self._opt = AdafactorOptimizer(lr / 500.0) + self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr) else: self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) @@ -113,130 +114,6 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): grads_and_vars, global_step=global_step, name=name) -def _sqrt_decay(step): - """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" - return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) - - -def _exp_decay_after(step, rate, from_which_step): - """Decay exponentially by rate (per step) starting at from_which_step.""" - return tf.cond( - step < from_which_step, - lambda: tf.constant(1.0), - lambda: rate**(step - from_which_step), - name="exponential_decay_step_cond") - - -def piecewise_learning_rate(step, boundaries, values): - """Scale learning rate according to the given schedule. - - Multipliers are not cumulative. - - Args: - step: global step - boundaries: List of steps to transition on. - values: Multiplier to apply at each boundary transition. - - Returns: - Scaled value for the learning rate. - """ - values = [1.0] + values - return tf.train.piecewise_constant( - step, boundaries, values, name="piecewise_lr") - - -def learning_rate_decay(hparams, warmup_steps=0): - """Learning rate decay multiplier.""" - scheme = hparams.learning_rate_decay_scheme - warmup_steps = tf.to_float(warmup_steps) - global_step = tf.to_float(tf.train.get_or_create_global_step()) - - if not scheme or scheme == "none": - return tf.constant(1.) - - tf.logging.info("Applying learning rate decay: %s.", scheme) - - if scheme == "exp": - decay_steps = hparams.learning_rate_decay_steps - p = (global_step - warmup_steps) / decay_steps - if hparams.learning_rate_decay_staircase: - p = tf.floor(p) - return tf.pow(hparams.learning_rate_decay_rate, p) - - if scheme == "piecewise": - return piecewise_learning_rate(global_step, - hparams.learning_rate_boundaries, - hparams.learning_rate_multiples) - - if scheme == "cosine": - cycle_steps = hparams.learning_rate_cosine_cycle_steps - cycle_position = global_step % (2 * cycle_steps) - cycle_position = cycle_steps - tf.abs(cycle_steps - cycle_position) - return 0.5 * (1 + tf.cos(np.pi * cycle_position / cycle_steps)) - - if scheme == "cyclelinear10x": - # Cycle the rate linearly by 10x every warmup_steps, up and down. - cycle_steps = warmup_steps - cycle_position = global_step % (2 * cycle_steps) - cycle_position = tf.to_float( # Normalize to the interval [-1, 1]. - cycle_position - cycle_steps) / float(cycle_steps) - cycle_position = 1.0 - tf.abs(cycle_position) # 0 to 1 and back to 0. - return (cycle_position + 0.1) * 3.0 # 10x difference each cycle (0.3-3). 
- - if scheme == "sqrt": - return _sqrt_decay(global_step - warmup_steps) - - raise ValueError("Unrecognized learning rate decay scheme: %s" % - hparams.learning_rate_decay_scheme) - - -def learning_rate_warmup(warmup_steps, warmup_schedule="exp"): - """Learning rate warmup multiplier.""" - if not warmup_steps: - return tf.constant(1.) - - tf.logging.info("Applying %s learning rate warmup for %d steps", - warmup_schedule, warmup_steps) - - warmup_steps = tf.to_float(warmup_steps) - global_step = tf.to_float(tf.train.get_or_create_global_step()) - - if warmup_schedule == "exp": - return tf.exp(tf.log(0.01) / warmup_steps)**(warmup_steps - global_step) - else: - assert warmup_schedule == "linear" - start = tf.constant(0.35) - return ((tf.constant(1.) - start) / warmup_steps) * global_step + start - - -def learning_rate_decay_with_warmup(hparams, num_worker_replicas=1): - """Learning rate decay rate with warmup based on hparams.""" - warmup_steps = hparams.learning_rate_warmup_steps * num_worker_replicas - warmup = learning_rate_warmup(warmup_steps) - - decay = learning_rate_decay(hparams, warmup_steps) - - global_step = tf.train.get_or_create_global_step() - return tf.where(global_step < warmup_steps, warmup, decay) - - -def learning_rate_schedule(hparams, num_worker_replicas=1): - """Learning rate schedule based on hparams.""" - schedule = hparams.learning_rate_schedule - warmup_steps = tf.to_float(hparams.learning_rate_warmup_steps) - global_step = tf.to_float(tf.train.get_or_create_global_step()) - if hparams.learning_rate_decay_scheme == "noam": - # backwards compatiblity with previous behavior - schedule = "linear_warmup_rsqrt_decay" - if schedule == "warmup_and_decay": - return learning_rate_decay_with_warmup(hparams, num_worker_replicas) - elif schedule == "linear_warmup_rsqrt_decay": - return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( - (global_step + 1) * warmup_steps**-1.5, (global_step + 1)**-0.5) - else: - raise ValueError("Unrecognized learning rate schedule: %s" % schedule) - - def weight_decay_and_noise(loss, hparams, learning_rate, var_list=None): """Apply weight decay and weight noise.""" if var_list is None: @@ -347,231 +224,3 @@ def get_variable_initializer(hparams): else: raise ValueError("Unrecognized initializer: %s" % hparams.initializer) - -class AdafactorOptimizer(tf.train.Optimizer): - """Optimizer that implements the Adafactor algorithm. - - Adafactor is similar to RMSProp (ADAM, etc.), but takes advantage of the - structure of weight matrices to use less memory and to be more resilient to - sudden large gradients. - - The RMSProp algorithm works on each component independently as follows: - w -= grad * learning_rate / sqrt(estimated_mean_square_grad) - - learning_rate is the desired update magnitude, and - estimated_mean_square_grad is computed by exponential smoothing of the - square of the gradient. - - Adafactor addresses two shortcomings of RMSProp: - - 1. In RMSProp (ADAM, etc), maintaining estimated_mean_square_grad requires - memory equal to the number of parameters. This can be an impediment to - training large models on GPU/TPU systems with limited memory. - - Adafactor uses less memory. - For an AxB weight matrix, instead of keeping a full AxB - estimated_mean_square_grad matrix, Adafactor keeps only - exponentially-smoothed row and column means, and bases its estimates on - those means. Thus the memory requirements drop from `2AB` to `A+B`. - - 2. Depending on the decay rate of the exponential smoothing, we run into one - of two problems. 
- - If the decay rate is high (short memory), we see the problem described - here - worse final quality: - On the Convergence of Adam and Beyond - https://openreview.net/forum?id=ryQu7f-RZ - - If the decay rate is low (long memory), then the estimate does not adjust - rapidly to suddenly large gradients, and the model diverges. - Suddenly large gradients (which we will call anomalies), may happen either - due to weird training data, or because the model has just learned something - important and can now rush to exploit it. Momentum (as in ADAM) can help - prevent divergence, but it also requires more memory. Gradient clipping - can also help prevent divergence, but it is irritating in that setting - the right threshold depends on the knowing the scale of the gradients. - - Adafactor uses a relatively long memory (setting the decay rate to - step_num^-0.8), but detects and corrects for anomalies. An anomaly - is detected if the mean-square gradient for the current step - (across the entire weight matrix) is much greater than the historical - average. When this occurs, we increase estimated_mean_square_grad - for the current step for all weights in the matrix. Note: it is important - to detect anomalies based on entire matrices, rather than individual - weights, since any individual weight may legitimately have a pattern - of many small gradients and occasional very large ones. - - HYPERPARAMETERS: - learning_rate: desired magnitude of variable updates. a scalar - can be a - constant, but more likely should have a warmup and then decay - proportionally to rsqrt(step_num) - epsilon: 1e-20 - a small floating point value to avoid division by zero. - horizon_exponent: 0.8 - a value between 0 and 1 - The effective decay - horizon of the second-moment estimator is step_num^horizon_exponent. - anomaly_threshold: 2.0 - a value greater than 1. Suppress anomalies - where the mean-square-gradients for a step exceed the long-term average - by at least this factor. - - ALGORITHM: - - We initialize - ``` - t <- 0 - if var is 2-dimensional: - v_r <- zeros([num_rows]) - v_c <- zeros([num_cols]) - else: - v <- zeros(shape(var)) - ``` - - The update rule is as follows: - ``` - t <- t + 1 - decay_rate = 1 - t ^ (-horizon_exponent) - grad_squared = tf.square(grad) + epsilon - if var is 2-dimensional: - v_r <- decay_rate * v_r + (1 - decay_rate) * reduce_mean(grad_squared, 1) - v_c <- decay_rate * v_c + (1 - decay_rate) * reduce_mean(grad_squared, 0) - anomaly_factor = max(1.0, - reduce_mean(grad_squared) / reduce_mean(v_r) / anomaly_threshold) - est_v = anomaly_factor * outer_prod(v_r, v_c) / reduce_mean(v_r) - else: - v <- decay_rate * v + (1 - decay_rate) * grad_squared - anomaly_factor = max(1.0, - reduce_mean(grad_squared) / reduce_mean(v) / anomaly_threshold) - est_v = v * anomaly_factor - var <- var - lr * grad / sqrt(est_v) - ``` - TODO(noam): write a paper. - TODO(noam): we should also apply the 2d logic to the two final dimensions. - of >2d convolutional kernels. - """ - - def __init__(self, - learning_rate=0.001, - epsilon=1e-20, - horizon_exponent=0.8, - anomaly_threshold=2.0, - use_locking=False, - name="Adafactor"): - """Construct a new Adafactor optimizer. - - See class comment. - - Args: - learning_rate: A Tensor or a floating point value. The learning rate. - epsilon: A small constant for numerical stability. - horizon_exponent: a floating point value between 0 and 1 - anomaly_threshold: a floating point value >= 1.0 - use_locking: If True use locks for update operations. 
- name: Optional name for the operations created when applying gradients. - Defaults to "AdafactorOptimizer". - """ - super(AdafactorOptimizer, self).__init__(use_locking, name) - self._lr = learning_rate - self._epsilon = epsilon - self._horizon_exponent = horizon_exponent - self._anomaly_threshold = anomaly_threshold - - def _should_use_factored_second_moment_estimate(self, shape): - """Should we use a factored second moment estimator. - - Based on the shape of the variable. - - Args: - shape: a list of integers - Returns: - a boolean - """ - return len(shape) == 2 - - def _create_slots(self, var_list): - for v in var_list: - shape = v.get_shape().as_list() - if self._should_use_factored_second_moment_estimate(shape): - r_val = tf.zeros([shape[0]], dtype=tf.float32) - c_val = tf.zeros([shape[1]], dtype=tf.float32) - self._get_or_make_slot(v, r_val, "vr", self._name) - self._get_or_make_slot(v, c_val, "vc", self._name) - else: - self._zeros_slot(v, "v", self._name) - - def _apply_dense(self, grad, var): - return self._resource_apply_dense(grad, var) - - def _resource_apply_dense(self, grad, var): - grad_squared = tf.square(grad) + self._epsilon - grad_squared_mean = tf.reduce_mean(grad_squared) - lr = tf.to_float(self._lr) - global_step = tf.to_float(tf.train.get_or_create_global_step()) + 1.0 - # HACK: Make lr and global_step dependent on grad. - # This confounds the XLA rewriter and keeps it from fusing computations - # across different variables. This fusion is a bad for HBM usage, since - # it causes the gradients to persist in memory. - lr += grad_squared_mean * 1e-30 - global_step += grad_squared_mean * 1e-30 - # END HACK - mixing_rate = tf.pow(global_step, -self._horizon_exponent) - decay_rate = 1.0 - mixing_rate - shape = var.get_shape().as_list() - updates = [] - if self._should_use_factored_second_moment_estimate(shape): - grad_squared_row_mean = tf.reduce_mean(grad_squared, 1) - grad_squared_col_mean = tf.reduce_mean(grad_squared, 0) - vr = self.get_slot(var, "vr") - new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean) - vc = self.get_slot(var, "vc") - new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean) - vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking) - vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking) - updates = [vr_update, vc_update] - long_term_mean = tf.reduce_mean(new_vr) - anomaly_factor = self._anomaly_factor(grad_squared_mean, long_term_mean) - # This is the computation we should do. - # est_v = (tf.expand_dims(new_vr, 1) * tf.expand_dims(new_vc, 0) - # * anomaly_factor / long_term_mean) - # subtrahend = grad * lr / tf.sqrt(est_v) - # Instead we do the following, which is mathematically equivalent. - r_factor = lr * tf.rsqrt(new_vr * anomaly_factor / long_term_mean) - c_factor = tf.rsqrt(new_vc) - subtrahend = ( - grad * tf.expand_dims(r_factor, 1) * tf.expand_dims(c_factor, 0)) - else: - v = self.get_slot(var, "v") - new_v = decay_rate * v + mixing_rate * grad_squared - v_update = tf.assign(v, new_v, use_locking=self._use_locking) - updates = [v_update] - long_term_mean = tf.reduce_mean(new_v) - anomaly_factor = self._anomaly_factor(grad_squared_mean, long_term_mean) - # This is the computation we should do. - # est_v = (new_v * anomaly_factor) - # subtrahend = grad * lr / tf.sqrt(est_v) - # Instead we do the following, which is mathematically equivalent. 
- subtrahend = grad * (lr / tf.sqrt(anomaly_factor)) * tf.rsqrt(new_v) - var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking) - updates = [var_update] + updates - return tf.group(*updates) - - def _anomaly_factor(self, grad_squared_mean, long_term_mean): - """Multiplier for second-moment estimator, due to short-term anomalies. - - A step may have gradients with magnitudes much larger than the long-term - average. This can cause the model to diverge. In these cases, we want to - temoporarily increase the second-moment estimators to reflect that these - steps are anomalous. - - It is important to make these calculations on whole weight matrices, rather - than on individual parameters, since we want to allow individual parameters - to have occasional large updates. - - Args: - grad_squared_mean: A scalar. The mean square gradient on the varaible - for the current step. - long_term_mean: A scalar. The mean of the long-term second-moment - estimator. - Returns: - a scalar that should be multiplied into the second-moment-estimator for - this step. - """ - ratio = grad_squared_mean / long_term_mean - return tf.maximum(1.0, ratio / self._anomaly_threshold) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 1b4013fbc..085cc821f 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -34,6 +34,7 @@ from tensor2tensor.utils import beam_search from tensor2tensor.utils import decoding from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import learning_rate from tensor2tensor.utils import metrics from tensor2tensor.utils import optimize from tensor2tensor.utils import registry @@ -238,6 +239,9 @@ def bottom(self, features): # Transform the input features for key, input_modality in six.iteritems( self._problem_hparams.input_modality): + if key not in features: + tf.logging.warning("Missing feature %s - ignoring." % key) + continue do_reuse = input_modality.name in all_previous_modalities with tf.variable_scope(input_modality.name, reuse=do_reuse): log_info("Transforming feature '%s' with %s.bottom", key, @@ -336,13 +340,7 @@ def loss(self, logits, features): def optimize(self, loss, num_async_replicas=1): """Return a training op minimizing loss.""" log_info("Base learning rate: %f", self.hparams.learning_rate) - lr = self.hparams.learning_rate - decay_rate = optimize.learning_rate_schedule(self.hparams) - lr *= decay_rate - if self.hparams.learning_rate_minimum: - lr_min = float(self.hparams.learning_rate_minimum) - log_info("Applying learning rate minimum: %f", lr_min) - lr = tf.max(lr, tf.to_float(lr_min)) + lr = learning_rate.learning_rate_schedule(self.hparams) if num_async_replicas > 1: log_info("Dividing learning rate by num_async_replicas: %d", num_async_replicas)
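The net effect of the t2t_model.py change is that the schedule alone now yields the learning rate used for training: the extra multiplication by `hparams.learning_rate` and the manual minimum are no longer applied in `optimize`. The condensed sketch below illustrates the resulting flow; it is not the actual `T2TModel.optimize` method, the `optimize.optimize(loss, lr, hparams, use_tpu)` signature is assumed, and the async-replica adjustment is omitted for brevity.

```python
from tensor2tensor.utils import learning_rate
from tensor2tensor.utils import optimize


def training_op_sketch(loss, hparams, use_tpu=False):
  """Condensed illustration of how the training op is now assembled."""
  # The schedule string (e.g. "constant*linear_warmup*rsqrt_decay*...")
  # produces the final learning rate directly.
  lr = learning_rate.learning_rate_schedule(hparams)
  return optimize.optimize(loss, lr, hparams, use_tpu=use_tpu)
```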