diff --git a/docs/cloud_mlengine.md b/docs/cloud_mlengine.md index b257fab25..0750f5088 100644 --- a/docs/cloud_mlengine.md +++ b/docs/cloud_mlengine.md @@ -41,8 +41,8 @@ principle work just fine. Contributions/testers welcome. Launching on Cloud ML Engine works with `--t2t_usr_dir` as well as long as the directory is fully self-contained (i.e. the imports only refer to other modules in the directory). If there are additional PyPI dependencies that you need, you -can include a `setup.py` file in your directory (ensure that it uses -`setuptools.find_packages`). +can include a `requirements.txt` file in the directory specified by +`t2t_usr_dir`. # Hyperparameter Tuning diff --git a/docs/new_problem.md b/docs/new_problem.md index 7564e4ad8..fab76d90d 100644 --- a/docs/new_problem.md +++ b/docs/new_problem.md @@ -65,10 +65,10 @@ class PoetryLines(text_problems.Text2TextProblem): # 10% evaluation data return [{ "split": problem.DatasetSplit.TRAIN, - "shards": 90, + "shards": 9, }, { "split": problem.DatasetSplit.EVAL, - "shards": 10, + "shards": 1, }] def generate_samples(self, data_dir, tmp_dir, dataset_split): @@ -133,7 +133,7 @@ pre-existing "training" and "evaluation" sets. If we did, we'd set split. The `dataset_splits` method determines the fraction that goes to each split. The -training data will be generated into 90 files and the evaluation data into 10. +training data will be generated into 9 files and the evaluation data into 1. 90% of the data will be for training. 10% of the data will be for evaluation. ```python @@ -148,10 +148,10 @@ training data will be generated into 90 files and the evaluation data into 10. # 10% evaluation data return [{ "split": problem.DatasetSplit.TRAIN, - "shards": 90, + "shards": 9, }, { "split": problem.DatasetSplit.EVAL, - "shards": 10, + "shards": 1, }] ``` diff --git a/setup.py b/setup.py index f02efdb2d..c30c752dd 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.5.2', + version='1.5.3', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index 944ef016a..a9ab7177f 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -65,7 +65,7 @@ flags.DEFINE_string("output_dir", "", "Base output directory for run.") flags.DEFINE_string("schedule", "continuous_train_and_eval", "Method of Experiment to run.") - flags.DEFINE_integer("eval_steps", 10000, + flags.DEFINE_integer("eval_steps", 100, "Number of steps in evaluation. By default, eval will " "stop after eval_steps or when it runs through the eval " "dataset once in full, whichever comes first, so this " diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 4e72bf505..a23747c01 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -165,6 +165,8 @@ def generate_files(generator, output_filenames, max_cases=None): for writer in writers: writer.close() + tf.logging.info("Generated %s Examples", counter) + def download_report_hook(count, block_size, total_size): """Report hook for download progress. 
@@ -198,19 +200,22 @@ def maybe_download(directory, filename, uri): """ if not tf.gfile.Exists(directory): tf.logging.info("Creating directory %s" % directory) - os.mkdir(directory) + tf.gfile.MakeDirs(directory) filepath = os.path.join(directory, filename) if not tf.gfile.Exists(filepath): tf.logging.info("Downloading %s to %s" % (uri, filepath)) try: tf.gfile.Copy(uri, filepath) except tf.errors.UnimplementedError: - inprogress_filepath = filepath + ".incomplete" - inprogress_filepath, _ = urllib.urlretrieve( - uri, inprogress_filepath, reporthook=download_report_hook) - # Print newline to clear the carriage return from the download progress - print() - tf.gfile.Rename(inprogress_filepath, filepath) + if uri.startswith("http"): + inprogress_filepath = filepath + ".incomplete" + inprogress_filepath, _ = urllib.urlretrieve( + uri, inprogress_filepath, reporthook=download_report_hook) + # Print newline to clear the carriage return from the download progress + print() + tf.gfile.Rename(inprogress_filepath, filepath) + else: + raise ValueError("Unrecognized URI: " + filepath) statinfo = os.stat(filepath) tf.logging.info("Successfully downloaded %s, %s bytes." % (filename, statinfo.st_size)) @@ -232,7 +237,7 @@ def maybe_download_from_drive(directory, filename, url): """ if not tf.gfile.Exists(directory): tf.logging.info("Creating directory %s" % directory) - os.mkdir(directory) + tf.gfile.MakeDirs(directory) filepath = os.path.join(directory, filename) confirm_token = None if tf.gfile.Exists(filepath): diff --git a/tensor2tensor/data_generators/ice_parsing.py b/tensor2tensor/data_generators/ice_parsing.py index eea2616b5..22f5d1282 100644 --- a/tensor2tensor/data_generators/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -32,14 +32,10 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder -from tensor2tensor.data_generators import translate +from tensor2tensor.data_generators import text_problems from tensor2tensor.utils import registry -# End-of-sentence marker. 
-EOS = text_encoder.EOS_ID - - def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, source_vocab_size, target_vocab_size): """Generate source and target data from a single file.""" @@ -51,8 +47,9 @@ def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, data_dir, tmp_dir, filename, 1, prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size) pair_filepath = os.path.join(tmp_dir, filename) - return translate.tabbed_generator(pair_filepath, source_vocab, target_vocab, - EOS) + return text_problems.text2text_generate_encoded( + text_problems.text2text_txt_tab_iterator(pair_filepath), source_vocab, + target_vocab) def tabbed_parsing_character_generator(tmp_dir, train): @@ -60,8 +57,8 @@ def tabbed_parsing_character_generator(tmp_dir, train): character_vocab = text_encoder.ByteTextEncoder() filename = "parsing_{0}.pairs".format("train" if train else "dev") pair_filepath = os.path.join(tmp_dir, filename) - return translate.tabbed_generator(pair_filepath, character_vocab, - character_vocab, EOS) + return text_problems.text2text_generate_encoded( + text_problems.text2text_txt_tab_iterator(pair_filepath), character_vocab) @registry.register_problem @@ -114,8 +111,9 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): def hparams(self, defaults, unused_model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, - source_vocab_size)} + p.input_modality = { + "inputs": (registry.Modalities.SYMBOL, source_vocab_size) + } p.target_modality = (registry.Modalities.SYMBOL, self.targeted_vocab_size) p.input_space_id = self.input_space_id p.target_space_id = self.target_space_id diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index fc4d0347e..4f14c1040 100644 --- a/tensor2tensor/data_generators/lm1b.py +++ b/tensor2tensor/data_generators/lm1b.py @@ -151,7 +151,7 @@ class LanguagemodelLm1b32k(text_problems.Text2TextProblem): """A language model on the 1B words corpus.""" @property - def vocab_name(self): + def vocab_filename(self): return "vocab.lm1b.en.%d" % self.approx_vocab_size @property diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index ebcc0697d..fa4fbea96 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -162,7 +162,7 @@ class Problem(object): data_dir. Vocab files are newline-separated files with each line containing a token. The standard convention for the filename is to set it to be - ${Problem.vocab_name}.${Problem.targeted_vocab_size} + ${Problem.vocab_filename}.${Problem.targeted_vocab_size} - Downloads and other files can be written to tmp_dir - If you have a training and dev generator, you can generate the training and dev datasets with @@ -721,6 +721,11 @@ def define_shapes(example): dataset = dataset.repeat() data_files = tf.contrib.slim.parallel_reader.get_data_files( self.filepattern(data_dir, mode)) + # In continuous_train_and_eval when switching between train and + # eval, this input_fn method gets called multiple times and it + # would give you the exact same samples from the last call + # (because the Graph seed is set). So this skip gives you some + # shuffling. 
dataset = skip_random_fraction(dataset, data_files[0]) dataset = dataset.map( diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 985a93b30..fa057ade9 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -282,11 +282,12 @@ def _init_vocab_from_file(self, filename): Args: filename: The file to load vocabulary from. """ + with tf.gfile.Open(filename) as f: + tokens = [token.strip() for token in f.readlines()] + def token_gen(): - with tf.gfile.Open(filename) as f: - for line in f: - token = line.strip() - yield token + for token in tokens: + yield token self._init_vocab(token_gen(), add_reserved_tokens=False) @@ -379,7 +380,7 @@ def match(m): try: return six.unichr(int(m.group(1))) except (ValueError, OverflowError) as _: - return u"\u3013" + return u"\u3013" # Unicode for undefined character. trimmed = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token return _UNESCAPE_REGEX.sub(match, trimmed) @@ -827,11 +828,9 @@ def _load_from_file_object(self, f): self._init_alphabet_from_tokens(subtoken_strings) def _load_from_file(self, filename): - """Load from a file. - - Args: - filename: Filename to load vocabulary from - """ + """Load from a vocab file.""" + if not tf.gfile.Exists(filename): + raise ValueError("File %s not found" % filename) with tf.gfile.Open(filename) as f: self._load_from_file_object(f) diff --git a/tensor2tensor/data_generators/text_problems.py b/tensor2tensor/data_generators/text_problems.py index 4184c974d..73e6bf4c7 100644 --- a/tensor2tensor/data_generators/text_problems.py +++ b/tensor2tensor/data_generators/text_problems.py @@ -222,15 +222,8 @@ def _maybe_pack_examples(self, generator): def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): generator = self.generate_samples(data_dir, tmp_dir, dataset_split) encoder = self.get_or_create_vocab(data_dir, tmp_dir) - for sample in generator: - targets = encoder.encode(sample["targets"]) - targets.append(text_encoder.EOS_ID) - encoded_sample = {"targets": targets} - if self.has_inputs: - inputs = encoder.encode(sample["inputs"]) - inputs.append(text_encoder.EOS_ID) - encoded_sample["inputs"] = inputs - yield encoded_sample + return text2text_generate_encoded(generator, encoder, + has_inputs=self.has_inputs) @property def batch_size_means_tokens(self): @@ -244,15 +237,15 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): problem.DatasetSplit.TEST: self.test_filepaths, } - split_paths = dict([(split["split"], filepath_fns[split["split"]]( + split_paths = [(split["split"], filepath_fns[split["split"]]( data_dir, split["shards"], shuffled=False)) - for split in self.dataset_splits]) + for split in self.dataset_splits] all_paths = [] - for paths in split_paths.values(): + for _, paths in split_paths: all_paths.extend(paths) if self.is_generate_per_split: - for split, paths in split_paths.items(): + for split, paths in split_paths: generator_utils.generate_files( self._maybe_pack_examples( self.generate_encoded_samples(data_dir, tmp_dir, split)), paths) @@ -418,8 +411,7 @@ def example_reading_spec(self): def txt_line_iterator(txt_path): """Iterate through lines of file.""" with tf.gfile.Open(txt_path) as f: - readline = lambda: f.readline() - for line in iter(readline, ""): + for line in f: yield line.strip() @@ -472,11 +464,26 @@ def text2text_txt_tab_iterator(txt_path): """ for line in txt_line_iterator(txt_path): if line and "\t" in line: - parts = line.split("\t") + 
parts = line.split("\t", 1) inputs, targets = parts[:2] yield {"inputs": inputs.strip(), "targets": targets.strip()} +def text2text_generate_encoded(sample_generator, + vocab, + targets_vocab=None, + has_inputs=True): + """Encode Text2Text samples from the generator with the vocab.""" + targets_vocab = targets_vocab or vocab + for sample in sample_generator: + if has_inputs: + sample["inputs"] = vocab.encode(sample["inputs"]) + sample["inputs"].append(text_encoder.EOS_ID) + sample["targets"] = targets_vocab.encode(sample["targets"]) + sample["targets"].append(text_encoder.EOS_ID) + yield sample + + @registry.register_problem class Text2textTmpdir(Text2TextProblem): """Allows training a Text2TextProblem without defining a subclass. diff --git a/tensor2tensor/data_generators/translate.py b/tensor2tensor/data_generators/translate.py index bf3196144..435d1dfe2 100644 --- a/tensor2tensor/data_generators/translate.py +++ b/tensor2tensor/data_generators/translate.py @@ -52,9 +52,8 @@ def vocab_data_files(self): return self.source_data_files(problem.DatasetSplit.TRAIN) def generate_samples(self, data_dir, tmp_dir, dataset_split): - train = dataset_split == problem.DatasetSplit.TRAIN datasets = self.source_data_files(dataset_split) - tag = "train" if train else "dev" + tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev" data_path = compile_data(tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag)) @@ -67,127 +66,6 @@ def generate_samples(self, data_dir, tmp_dir, dataset_split): data_path + ".lang2") -# Generic generators used later for multiple problems. - - -def character_generator(source_path, target_path, character_vocab, eos=None): - """Generator for sequence-to-sequence tasks that just uses characters. - - This generator assumes the files at source_path and target_path have - the same number of lines and yields dictionaries of "inputs" and "targets" - where inputs are characters from the source lines converted to integers, - and targets are characters from the target lines, also converted to integers. - - Args: - source_path: path to the file with source sentences. - target_path: path to the file with target sentences. - character_vocab: a TextEncoder to encode the characters. - eos: integer to append at the end of each sequence (default: None). - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from characters in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - with tf.gfile.GFile(target_path, mode="r") as target_file: - source, target = source_file.readline(), target_file.readline() - while source and target: - source_ints = character_vocab.encode(source.strip()) + eos_list - target_ints = character_vocab.encode(target.strip()) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - source, target = source_file.readline(), target_file.readline() - - -def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): - r"""Generator for sequence-to-sequence tasks using tabbed files. - - Tokens are derived from text files where each line contains both - a source and a target string. The two strings are separated by a tab - character ('\t'). It yields dictionaries of "inputs" and "targets" where - inputs are characters from the source lines converted to integers, and - targets are characters from the target lines, also converted to integers. 
- - Args: - source_path: path to the file with source and target sentences. - source_vocab: a SubwordTextEncoder to encode the source string. - target_vocab: a SubwordTextEncoder to encode the target string. - eos: integer to append at the end of each sequence (default: None). - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from characters in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - for line in source_file: - if line and "\t" in line: - parts = line.split("\t", 1) - source, target = parts[0].strip(), parts[1].strip() - source_ints = source_vocab.encode(source) + eos_list - target_ints = target_vocab.encode(target) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - - -def token_generator(source_path, target_path, token_vocab, eos=None): - """Generator for sequence-to-sequence tasks that uses tokens. - - This generator assumes the files at source_path and target_path have - the same number of lines and yields dictionaries of "inputs" and "targets" - where inputs are token ids from the " "-split source (and target, resp.) lines - converted to integers using the token_map. - - Args: - source_path: path to the file with source sentences. - target_path: path to the file with target sentences. - token_vocab: text_encoder.TextEncoder object. - eos: integer to append at the end of each sequence (default: None). - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from tokens in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - with tf.gfile.GFile(target_path, mode="r") as target_file: - source, target = source_file.readline(), target_file.readline() - while source and target: - source_ints = token_vocab.encode(source.strip()) + eos_list - target_ints = token_vocab.encode(target.strip()) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - source, target = source_file.readline(), target_file.readline() - - -def bi_vocabs_token_generator(source_path, - target_path, - source_token_vocab, - target_token_vocab, - eos=None): - """Generator for sequence-to-sequence tasks that uses tokens. - - This generator assumes the files at source_path and target_path have - the same number of lines and yields dictionaries of "inputs" and "targets" - where inputs are token ids from the " "-split source (and target, resp.) lines - converted to integers using the token_map. - - Args: - source_path: path to the file with source sentences. - target_path: path to the file with target sentences. - source_token_vocab: text_encoder.TextEncoder object. - target_token_vocab: text_encoder.TextEncoder object. - eos: integer to append at the end of each sequence (default: None). - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from tokens in the file lines. 
- """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - with tf.gfile.GFile(target_path, mode="r") as target_file: - source, target = source_file.readline(), target_file.readline() - while source and target: - source_ints = source_token_vocab.encode(source.strip()) + eos_list - target_ints = target_token_vocab.encode(target.strip()) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - source, target = source_file.readline(), target_file.readline() - - def _preprocess_sgm(line, is_sgm): """Preprocessing to strip tags in SGM files.""" if not is_sgm: @@ -209,14 +87,19 @@ def _preprocess_sgm(line, is_sgm): def compile_data(tmp_dir, datasets, filename): """Concatenate all `datasets` and save to `filename`.""" filename = os.path.join(tmp_dir, filename) - with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile: - with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile: + lang1_fname = filename + ".lang1" + lang2_fname = filename + ".lang2" + if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname): + tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname, + lang2_fname) + with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile: + with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile: for dataset in datasets: url = dataset[0] compressed_filename = os.path.basename(url) compressed_filepath = os.path.join(tmp_dir, compressed_filename) - - generator_utils.maybe_download(tmp_dir, compressed_filename, url) + if url.startswith("http"): + generator_utils.maybe_download(tmp_dir, compressed_filename, url) if dataset[1][0] == "tsv": _, src_column, trg_column, glob_pattern = dataset[1] @@ -232,13 +115,17 @@ def compile_data(tmp_dir, datasets, filename): new_filename = tsv_filename.strip(".gz") generator_utils.gunzip_file(tsv_filename, new_filename) tsv_filename = new_filename - with tf.gfile.GFile(tsv_filename, mode="r") as tsv_file: + with tf.gfile.Open(tsv_filename) as tsv_file: for line in tsv_file: if line and "\t" in line: parts = line.split("\t") source, target = parts[src_column], parts[trg_column] - lang1_resfile.write(source.strip() + "\n") - lang2_resfile.write(target.strip() + "\n") + source, target = source.strip(), target.strip() + if source and target: + lang1_resfile.write(source) + lang1_resfile.write("\n") + lang2_resfile.write(target) + lang2_resfile.write("\n") else: lang1_filename, lang2_filename = dataset[1] lang1_filepath = os.path.join(tmp_dir, lang1_filename) @@ -246,8 +133,8 @@ def compile_data(tmp_dir, datasets, filename): is_sgm = ( lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm")) - if not (os.path.exists(lang1_filepath) and - os.path.exists(lang2_filepath)): + if not (tf.gfile.Exists(lang1_filepath) and + tf.gfile.Exists(lang2_filepath)): # For .tar.gz and .tgz files, we read compressed. 
mode = "r:gz" if compressed_filepath.endswith("gz") else "r" with tarfile.open(compressed_filepath, mode) as corpus_tar: @@ -260,15 +147,15 @@ def compile_data(tmp_dir, datasets, filename): new_filepath = lang2_filepath.strip(".gz") generator_utils.gunzip_file(lang2_filepath, new_filepath) lang2_filepath = new_filepath - with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file: - with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file: - line1, line2 = lang1_file.readline(), lang2_file.readline() - while line1 or line2: - line1res = _preprocess_sgm(line1, is_sgm) - line2res = _preprocess_sgm(line2, is_sgm) - if line1res or line2res: - lang1_resfile.write(line1res.strip() + "\n") - lang2_resfile.write(line2res.strip() + "\n") - line1, line2 = lang1_file.readline(), lang2_file.readline() + + for example in text_problems.text2text_txt_iterator( + lang1_filepath, lang2_filepath): + line1res = _preprocess_sgm(example["inputs"], is_sgm) + line2res = _preprocess_sgm(example["targets"], is_sgm) + if line1res and line2res: + lang1_resfile.write(line1res) + lang1_resfile.write("\n") + lang2_resfile.write(line2res) + lang2_resfile.write("\n") return filename diff --git a/tensor2tensor/data_generators/translate_ende.py b/tensor2tensor/data_generators/translate_ende.py index b493ec5c9..2a1e52c2f 100644 --- a/tensor2tensor/data_generators/translate_ende.py +++ b/tensor2tensor/data_generators/translate_ende.py @@ -142,6 +142,14 @@ def packed_length(self): return 256 +@registry.register_problem +class TranslateEndeWmt8kPacked(TranslateEndeWmt8k): + + @property + def packed_length(self): + return 256 + + @registry.register_problem class TranslateEndeWmtCharacters(translate.TranslateProblem): """Problem spec for WMT En-De translation.""" diff --git a/tensor2tensor/data_generators/translate_enzh.py b/tensor2tensor/data_generators/translate_enzh.py index 01f9d8fc1..444fc9834 100644 --- a/tensor2tensor/data_generators/translate_enzh.py +++ b/tensor2tensor/data_generators/translate_enzh.py @@ -26,6 +26,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.data_generators import translate from tensor2tensor.utils import registry @@ -47,9 +48,11 @@ # This dataset is only a small fraction of full WMT17 task _NC_TRAIN_DATASETS = [[ "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12" - ".tgz", - ["training/news-commentary-v12.zh-en.en", - "training/news-commentary-v12.zh-en.zh"]]] + ".tgz", [ + "training/news-commentary-v12.zh-en.en", + "training/news-commentary-v12.zh-en.zh" + ] +]] # Test set from News Commentary. 2000 lines _NC_TEST_DATASETS = [[ @@ -65,8 +68,8 @@ # place into tmp directory e.g. /tmp/t2t_datagen/dataset.tgz _UN_TRAIN_DATASETS = [[ "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/UNv1.0.en-zh.tar" - ".gz", - ["en-zh/UNv1.0.en-zh.en", "en-zh/UNv1.0.en-zh.zh"]]] + ".gz", ["en-zh/UNv1.0.en-zh.en", "en-zh/UNv1.0.en-zh.zh"] +]] # CWMT corpus # Visit source website to download manually: @@ -81,57 +84,79 @@ # NOTE: You need to register to download dataset from official source # place into tmp directory e.g. 
/tmp/t2t_datagen/dataset.tgz -_CWMT_TRAIN_DATASETS = [ - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/casia2015/casia2015_en.txt", "cwmt/casia2015/casia2015_ch.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/casict2015/casict2015_en.txt", - "cwmt/casict2015/casict2015_ch.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/neu2017/NEU_en.txt", "cwmt/neu2017/NEU_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2015/datum_en.txt", "cwmt/datum2015/datum_ch.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book1_en.txt", "cwmt/datum2017/Book1_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book2_en.txt", "cwmt/datum2017/Book2_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book3_en.txt", "cwmt/datum2017/Book3_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book4_en.txt", "cwmt/datum2017/Book4_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book5_en.txt", "cwmt/datum2017/Book5_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book6_en.txt", "cwmt/datum2017/Book6_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book7_en.txt", "cwmt/datum2017/Book7_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book8_en.txt", "cwmt/datum2017/Book8_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book9_en.txt", "cwmt/datum2017/Book9_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book10_en.txt", "cwmt/datum2017/Book10_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book11_en.txt", "cwmt/datum2017/Book11_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book12_en.txt", "cwmt/datum2017/Book12_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book13_en.txt", "cwmt/datum2017/Book13_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book14_en.txt", "cwmt/datum2017/Book14_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book15_en.txt", "cwmt/datum2017/Book15_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book16_en.txt", "cwmt/datum2017/Book16_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book17_en.txt", "cwmt/datum2017/Book17_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book18_en.txt", "cwmt/datum2017/Book18_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book19_en.txt", "cwmt/datum2017/Book19_cn.txt"]], - ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book20_en.txt", "cwmt/datum2017/Book20_cn.txt"]] -] +_CWMT_TRAIN_DATASETS = [[ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + 
["cwmt/casia2015/casia2015_en.txt", "cwmt/casia2015/casia2015_ch.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/casict2015/casict2015_en.txt", "cwmt/casict2015/casict2015_ch.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/neu2017/NEU_en.txt", "cwmt/neu2017/NEU_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2015/datum_en.txt", "cwmt/datum2015/datum_ch.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book1_en.txt", "cwmt/datum2017/Book1_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book2_en.txt", "cwmt/datum2017/Book2_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book3_en.txt", "cwmt/datum2017/Book3_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book4_en.txt", "cwmt/datum2017/Book4_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book5_en.txt", "cwmt/datum2017/Book5_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book6_en.txt", "cwmt/datum2017/Book6_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book7_en.txt", "cwmt/datum2017/Book7_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book8_en.txt", "cwmt/datum2017/Book8_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book9_en.txt", "cwmt/datum2017/Book9_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book10_en.txt", "cwmt/datum2017/Book10_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book11_en.txt", "cwmt/datum2017/Book11_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book12_en.txt", "cwmt/datum2017/Book12_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book13_en.txt", "cwmt/datum2017/Book13_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book14_en.txt", "cwmt/datum2017/Book14_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book15_en.txt", "cwmt/datum2017/Book15_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book16_en.txt", "cwmt/datum2017/Book16_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book17_en.txt", "cwmt/datum2017/Book17_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book18_en.txt", "cwmt/datum2017/Book18_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book19_en.txt", "cwmt/datum2017/Book19_cn.txt"] +], [ + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", + ["cwmt/datum2017/Book20_en.txt", "cwmt/datum2017/Book20_cn.txt"] +]] def get_filename(dataset): @@ -215,9 +240,10 @@ def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): tag = "train" if train else "dev" filename_base = 
"wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag) data_path = translate.compile_data(tmp_dir, datasets, filename_base) - return translate.bi_vocabs_token_generator(data_path + ".lang1", - data_path + ".lang2", - source_vocab, target_vocab, EOS) + return text_problems.text2text_generate_encoded( + text_problems.text2text_txt_iterator(data_path + ".lang1", + data_path + ".lang2"), + source_vocab, target_vocab) def feature_encoders(self, data_dir): source_vocab_filename = os.path.join(data_dir, self.source_vocab_name) diff --git a/tensor2tensor/data_generators/translate_test.py b/tensor2tensor/data_generators/translate_test.py index f28b47818..201898352 100644 --- a/tensor2tensor/data_generators/translate_test.py +++ b/tensor2tensor/data_generators/translate_test.py @@ -19,64 +19,71 @@ from __future__ import division from __future__ import print_function -import io import os -import tempfile +import shutil +import tarfile # Dependency imports -import six -from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.data_generators import translate import tensorflow as tf class TranslateTest(tf.test.TestCase): + DATASETS = [ + ["data1.tgz", ("train1.en", "train1.de")], + ["data2.tgz", ("train2.en", "train2.de")], + ["data3.tgz", ("train3.en", "train3.de")], + ] - def testCharacterGenerator(self): - # Generate a trivial source and target file. - tmp_dir = self.get_temp_dir() - (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) - if six.PY2: - enc_f = lambda s: s - else: - enc_f = lambda s: s.encode("utf-8") - with io.open(tmp_file_path + ".src", "wb") as src_file: - src_file.write(enc_f("source1\n")) - src_file.write(enc_f("source2\n")) - with io.open(tmp_file_path + ".tgt", "wb") as tgt_file: - tgt_file.write(enc_f("target1\n")) - tgt_file.write(enc_f("target2\n")) - - # Call character generator on the generated files. - results_src, results_tgt = [], [] - character_vocab = text_encoder.ByteTextEncoder() - for dictionary in translate.character_generator( - tmp_file_path + ".src", tmp_file_path + ".tgt", character_vocab): - self.assertEqual(sorted(list(dictionary)), ["inputs", "targets"]) - results_src.append(dictionary["inputs"]) - results_tgt.append(dictionary["targets"]) - - # Check that the results match the files. - # First check that the results match the encoded original strings; - # this is a comparison of integer arrays. - self.assertEqual(len(results_src), 2) - self.assertEqual(results_src[0], character_vocab.encode("source1")) - self.assertEqual(results_src[1], character_vocab.encode("source2")) - self.assertEqual(results_tgt[0], character_vocab.encode("target1")) - self.assertEqual(results_tgt[1], character_vocab.encode("target2")) - # Then decode the results and compare with the original strings; - # this is a comparison of strings - self.assertEqual(character_vocab.decode(results_src[0]), "source1") - self.assertEqual(character_vocab.decode(results_src[1]), "source2") - self.assertEqual(character_vocab.decode(results_tgt[0]), "target1") - self.assertEqual(character_vocab.decode(results_tgt[1]), "target2") - - # Clean up. 
- os.remove(tmp_file_path + ".src") - os.remove(tmp_file_path + ".tgt") - os.remove(tmp_file_path) + @classmethod + def setUpClass(cls): + tmp_dir = tf.test.get_temp_dir() + compressed_dir = os.path.join(tmp_dir, "compressed") + shutil.rmtree(tmp_dir) + tf.gfile.MakeDirs(compressed_dir) + + en_data = [str(i) for i in range(10, 40)] + de_data = [str(i) for i in range(100, 130)] + data = list(zip(en_data, de_data)) + + for i, dataset in enumerate(cls.DATASETS): + tar_file = dataset[0] + en_file, de_file = [ + os.path.join(compressed_dir, name) for name in dataset[1] + ] + with tf.gfile.Open(en_file, "w") as en_f: + with tf.gfile.Open(de_file, "w") as de_f: + start = i * 10 + end = start + 10 + for en_line, de_line in data[start:end]: + en_f.write(en_line) + en_f.write("\n") + de_f.write(de_line) + de_f.write("\n") + + with tarfile.open(os.path.join(tmp_dir, tar_file), "w:gz") as tar_f: + tar_f.add(en_file, os.path.basename(en_file)) + tar_f.add(de_file, os.path.basename(de_file)) + + cls.tmp_dir = tmp_dir + cls.data = data + + def testCompileData(self): + filename = "out" + filepath = os.path.join(self.tmp_dir, filename) + translate.compile_data(self.tmp_dir, self.DATASETS, filename) + + count = 0 + for i, example in enumerate( + text_problems.text2text_txt_iterator(filepath + ".lang1", + filepath + ".lang2")): + expected = self.data[i] + self.assertEqual(list(expected), [example["inputs"], example["targets"]]) + count += 1 + self.assertEqual(count, len(self.data)) if __name__ == "__main__": diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 33a77b746..9909a1267 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -223,7 +223,7 @@ class LanguagemodelWikiNorefV8kL1k(LanguagemodelWikiXmlV8kL1k): """ @property - def vocab_name(self): + def vocab_filename(self): return "vocab.wiki_noref" def filepath_to_unicode_text(self, filepath): diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index d567016a5..a9346c34d 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -1510,7 +1510,6 @@ def masked_local_attention_1d(q, k, v, block_length=128, """ with tf.variable_scope( name, default_name="local_attention_1d", values=[q, k, v]): - v_shape = v.get_shape() batch = common_layers.shape_list(q)[0] heads = common_layers.shape_list(q)[1] length = common_layers.shape_list(q)[2] @@ -1534,7 +1533,11 @@ def masked_local_attention_1d(q, k, v, block_length=128, q = tf.pad(q, padding) k = tf.pad(k, padding) v = tf.pad(v, padding) - num_blocks = tf.div(length, block_length) + + if isinstance(length, int) and isinstance(block_length, int): + num_blocks = length // block_length + else: + num_blocks = tf.div(length, block_length) # compute attention for the first query block. 
first_q = tf.slice(q, [0, 0, 0, 0], [-1, -1, block_length, -1]) @@ -1553,17 +1556,21 @@ def masked_local_attention_1d(q, k, v, block_length=128, k = tf.reshape(k, [batch, heads, num_blocks, block_length, depth_k]) v = tf.reshape(v, [batch, heads, num_blocks, block_length, depth_v]) - def local(x): + def local(x, depth): """Create a local version of the keys or values.""" prev_block = tf.slice(x, [0, 0, 0, 0, 0], [-1, -1, num_blocks - 1, -1, -1]) cur_block = tf.slice(x, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) - return tf.concat([prev_block, cur_block], 3) + local_block = tf.concat([prev_block, cur_block], 3) + return tf.reshape(local_block, + [batch, heads, num_blocks - 1, + block_length * 2, depth]) - local_k = local(k) - local_v = local(v) + local_k = local(k, depth_k) + local_v = local(v, depth_v) tail_q = tf.slice(q, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) - + tail_q = tf.reshape(tail_q, [batch, heads, num_blocks - 1, + block_length, depth_k]) local_length = common_layers.shape_list(local_k)[3] # [batch, heads, num_blocks - 1, block_length, local_length] @@ -1579,10 +1586,11 @@ def local(x): # The naive way currently causes errors due to empty tensors. # output: [batch, heads, num_blocks-1, block_length, depth_v] output = tf.matmul(attention, local_v) - output = tf.reshape(output, [batch, heads, -1, depth_v]) + output = tf.reshape(output, [ + batch, heads, (num_blocks-1)*block_length, depth_v]) output = tf.concat([first_output, output], axis=2) output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) - output.set_shape(v_shape) + output = tf.reshape(output, [batch, heads, original_length, depth_v]) return output diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 159006b9b..c4c1cf885 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -61,13 +61,25 @@ def basic_params1(): optimizer_adam_beta2=0.997, optimizer_momentum_momentum=0.9, optimizer_momentum_nesterov=False, + optimizer_adafactor_beta1=0.0, + optimizer_adafactor_beta2=0.999, + optimizer_adafactor_factored=True, + optimizer_adafactor_decay_type="pow", + optimizer_adafactor_memory_exponent=0.8, + optimizer_adafactor_clipping_threshold=1.0, + optimizer_adafactor_multiply_by_parameter_scale=True, weight_decay=1e-6, weight_noise=0.0, - learning_rate_schedule="warmup_and_decay", - # If learning_rate_schedule=="warmup_and_decay", then this specifies - # the decay part of the schedule. - # The warmup is always exponential. - # TODO(noam): add a hyperparameter to control the warmup. + # Defines the learning rate as a product of named functions. + # Available functions are listed in learning_rate._LEARNING_RATE_FUNCTIONS + # e.g. "constant*linear_warmup*rsqrt_decay*rsqrt_hidden_size" + learning_rate_schedule="legacy", + learning_rate_constant=1.0, + # If learning_rate_schedule=="legacy", + # then we specify decay scheme here. Warmup is always exponential, + # except with "noam" learning rate decay scheme. + # see optimize.legacy_learning_rate_schedule() + # TODO(noam): migrate everyone away from this. 
learning_rate_decay_scheme="none", # decay_steps and decay_staircase for learning_rate_decay_scheme=="exp" learning_rate_decay_steps=5000, diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py index f830ac977..0e7ac4a4e 100644 --- a/tensor2tensor/layers/common_image_attention.py +++ b/tensor2tensor/layers/common_image_attention.py @@ -45,14 +45,13 @@ def get_choices(): ] -def maybe_reshape_4d_to_3d(x, hparams): +def maybe_reshape_4d_to_3d(x): """Reshape input from 4D to 3D if necessary.""" x_shape = common_layers.shape_list(x) is_4d = False if len(x_shape) == 4: x = tf.reshape(x, [x_shape[0], x_shape[1]*x_shape[2], x_shape[3]]) is_4d = True - x.set_shape([None, None, hparams.hidden_size]) return x, x_shape, is_4d @@ -82,7 +81,7 @@ def local_attention_1d(x, kv_padding="VALID"): """Local 1d self attention.""" # self-attention - x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x) with tf.variable_scope("local_1d_self_att"): y = common_attention.multihead_attention( x, @@ -104,7 +103,6 @@ def local_attention_1d(x, name="self_attention") if is_4d: y = tf.reshape(y, x_shape) - y.set_shape([None, None, None, hparams.hidden_size]) return y @@ -117,7 +115,7 @@ def dilated_attention_1d(x, gap_size=2): """Dilated 1d self attention.""" # self-attention - x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x) with tf.variable_scope("masked_dilated_1d"): y = common_attention.multihead_attention( x, @@ -195,7 +193,7 @@ def full_self_attention(x, q_padding="LEFT", kv_padding="LEFT"): """Full self-attention layer.""" - x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x) with tf.variable_scope("self_att"): y = common_attention.multihead_attention( x, @@ -221,8 +219,8 @@ def encdec_attention_1d(x, encoder_output, hparams): """Local 1d self attention.""" - x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) - encoder_output, _, _ = maybe_reshape_4d_to_3d(encoder_output, hparams) + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x) + encoder_output, _, _ = maybe_reshape_4d_to_3d(encoder_output) with tf.variable_scope("encdec_attention"): # Encoder Decoder attention y = common_attention.multihead_attention( @@ -518,11 +516,12 @@ def prepare_decoder(targets, hparams): x = add_pos_signals(x, hparams, "dec_pos") else: # Add position signals - x = tf.reshape(x, [-1, x_shape[1]*x_shape[2], hparams.hidden_size]) + x = tf.reshape(x, [targets_shape[0], + x_shape[1]*x_shape[2], hparams.hidden_size]) x = common_layers.shift_right_3d(x) - x = tf.reshape(x, [-1, x_shape[1], x_shape[2], hparams.hidden_size]) + x = tf.reshape(x, [targets_shape[0], + x_shape[1], x_shape[2], hparams.hidden_size]) x = add_pos_signals(x, hparams, "dec_pos") - x.set_shape([None, None, None, hparams.hidden_size]) return x, x_shape[1], x_shape[2], bias diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 1bfc97248..87d60e911 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -216,11 +216,12 @@ def dropout_no_scaling(x, keep_prob): def embedding(x, vocab_size, dense_size, name=None, reuse=None, multiplier=1.0, - symbol_dropout_rate=0.0): + symbol_dropout_rate=0.0, embedding_var=None): """Embed x of type int64 into dense vectors, reducing to max 4 dimensions.""" with tf.variable_scope( name, default_name="embedding", values=[x], reuse=reuse): - embedding_var = 
tf.get_variable("kernel", [vocab_size, dense_size]) + if embedding_var is None: + embedding_var = tf.get_variable("kernel", [vocab_size, dense_size]) # On the backwards pass, we want to convert the gradient from # an indexed-slices to a regular tensor before sending it back to the # parameter server. This avoids excess computation on the parameter server. @@ -2388,7 +2389,7 @@ def ones_matrix_band_part(rows, cols, num_lower, num_upper, out_shape=None): num_lower = rows - 1 if num_upper < 0: num_upper = cols - 1 - lower_mask = np.tri(rows, cols, num_lower).T + lower_mask = np.tri(cols, rows, num_lower).T upper_mask = np.tri(rows, cols, num_upper) band = np.ones((rows, cols)) * lower_mask * upper_mask if out_shape: diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py new file mode 100644 index 000000000..ccb00ab6b --- /dev/null +++ b/tensor2tensor/layers/discretization.py @@ -0,0 +1,589 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Discretization bottlenecks used to train discrete latent variables. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from functools import partial +# Dependency imports +from tensor2tensor.layers import common_layers +import tensorflow as tf +from tensorflow.python.training import moving_averages + + +def project_hidden(x, projection_tensors, hidden_size, num_blocks): + """Project encoder hidden state into block_dim using projection tensors. + + Args: + x: Encoder hidden state of shape [-1, hidden_size]. + projection_tensors: Projection tensors used to project the hidden state. + hidden_size: Dimension of the latent space. + num_blocks: Number of blocks in DVQ. + + Returns: + Projected states of shape [-1, num_blocks, block_dim]. + """ + x = tf.reshape(x, shape=[1, -1, hidden_size]) + x_tiled = tf.reshape( + tf.tile(x, multiples=[num_blocks, 1, 1]), + shape=[num_blocks, -1, hidden_size]) + x_projected = tf.matmul(x_tiled, projection_tensors) + x_projected = tf.transpose(x_projected, perm=[1, 0, 2]) + return x_projected + + +def slice_hidden(x, hidden_size, num_blocks): + """Slice encoder hidden state into block_dim. + + Args: + x: Encoder hidden state of shape [-1, hidden_size]. + hidden_size: Dimension of the latent space. + num_blocks: Number of blocks in DVQ. + + Returns: + Sliced states of shape [-1, num_blocks, block_dim]. + """ + block_dim = int(hidden_size // num_blocks) + x_sliced = tf.reshape(x, shape=[-1, num_blocks, block_dim]) + return x_sliced + + +def nearest_neighbor(x, means, block_v_size, random_top_k=1): + """Find the nearest element in means to elements in x. + + Args: + x: Batch of encoder continuous latent states sliced/projected into shape + [-1, num_blocks, block_dim]. + means: Embedding table of shpae [num_blocks, block_v_size, block_dim]. + block_v_size: Number of table entries per block. + random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). 
+ + Returns: + Tensor with nearest element in mean encoded in one-hot notation. + """ + x_norm_sq = tf.reduce_sum(tf.square(x), axis=-1, keep_dims=True) + means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keep_dims=True) + scalar_prod = tf.matmul( + tf.transpose(x, perm=[1, 0, 2]), tf.transpose(means, perm=[0, 2, 1])) + scalar_prod = tf.transpose(scalar_prod, perm=[1, 0, 2]) + dist = x_norm_sq + tf.transpose( + means_norm_sq, perm=[2, 0, 1]) - 2 * scalar_prod + if random_top_k > 1: + _, top_k_idx = tf.nn.top_k(-dist, k=random_top_k) + nearest_idx = tf.gather( + top_k_idx, + tf.random_uniform( + [1], minval=0, maxval=random_top_k - 1, dtype=tf.int32), + axis=-1) + else: + nearest_idx = tf.argmax(-dist, axis=-1) + nearest_hot = tf.one_hot(nearest_idx, block_v_size) + return tf.stop_gradient(nearest_hot) + + +def embedding_lookup(x, means, num_blocks, block_v_size, random_top_k=1): + """Compute nearest neighbors and loss for training the embeddings via DVQ. + + Args: + x: Batch of encoder continuous latent states sliced/projected into shape + [-1, num_blocks, block_dim]. + means: Embedding table of shape [num_blocks, block_v_size, block_dim]. + num_blocks: Number of blocks in DVQ. + block_v_size: Number of table entries per block. + random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). + + Returns: + The nearest neighbor in one hot form, the nearest neighbor itself, the + commitment loss, embedding training loss. + """ + x_means_hot = nearest_neighbor(x, means, block_v_size, random_top_k) + x_means_hot_flat = tf.reshape(x_means_hot, [-1, num_blocks, block_v_size]) + x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means) + x_means = tf.transpose(x_means, [1, 0, 2]) + q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x) - x_means))) + e_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means))) + return x_means_hot, x_means, q_loss, e_loss + + +def bit_to_int(x_bit, num_bits, base=2): + """Turn x_bit representing numbers bitwise (lower-endian) to int tensor. + + Args: + x_bit: Tensor containing numbers in a particular base to be converted to + int. + num_bits: Number of bits in the representation. + base: Base of the representation. + + Returns: + Integer representation of this number. + """ + x_l = tf.stop_gradient(tf.to_int32(tf.reshape(x_bit, [-1, num_bits]))) + x_labels = [] + for i in range(num_bits): + x_labels.append(x_l[:, i] * tf.to_int32(base)**tf.to_int32(i)) + res = sum(x_labels) + return tf.to_int32(tf.reshape(res, common_layers.shape_list(x_bit)[:-1])) + + +def int_to_bit(x_int, num_bits, base=2): + """Turn x_int representing numbers into a bitwise (lower-endian) tensor. + + Args: + x_int: Tensor containing integer to be converted into base notation. + num_bits: Number of bits in the representation. + base: Base of the representation. + + Returns: + Corresponding number expressed in base. + """ + x_l = tf.to_int32(tf.expand_dims(x_int, axis=-1)) + x_labels = [] + for i in range(num_bits): + x_labels.append( + tf.floormod( + tf.floordiv(tf.to_int32(x_l), + tf.to_int32(base)**i), tf.to_int32(base))) + res = tf.concat(x_labels, axis=-1) + return tf.to_float(res) + + +def embed(x, + hidden_size, + z_size, + filter_size, + name, + bottleneck_kind='dvq', + num_blocks=2, + block_v_size=None, + means=None): + """Embedding function that takes discrete latent and returns embedding. + + Args: + x: Input to the discretization bottleneck. + hidden_size: Dimension of the latent state. 
+ z_size: Number of bits used to produce discrete code; discrete codes range + from 1 to 2**z_size. + filter_size: Filter size to be used for the embedding function. + name: Name for the bottleneck scope. + bottleneck_kind: Kind of discretization bottleneck to use; one of dvq, + semhash, gumbel-softmax. + num_blocks: Number of blocks in DVQ. + block_v_size: Number of embedding entries per block. + means: The embedding table for dvq (Default: None). + + Returns: + Continuous embedding to be passed on to the decoder. + + Raises: + ValueError: For unknown or missing arguments. + """ + with tf.variable_scope(name, reuse=tf.AUTO_REUSE): + if bottleneck_kind == 'semhash': + c = int_to_bit(x, z_size) + h1a = tf.layers.dense(c, filter_size, name='vch1a') + h1b = tf.layers.dense(1.0 - c, filter_size, name='vch1b') + h1 = h1a + h1b + elif bottleneck_kind == 'gumbel-softmax': + hot = tf.one_hot(x, 2**z_size) + h1 = tf.layers.dense(hot, hidden_size, name='dae_dense') + elif bottleneck_kind == 'dvq': + if block_v_size is None: + raise ValueError('Bottleneck kind is dvq but block_v_size is None.') + + shape_x = common_layers.shape_list(x) + x_flat = tf.reshape(x, [-1, 1]) + c = int_to_bit(x_flat, num_bits=z_size, base=2) + shape = common_layers.shape_list(c) + new_shape = shape + new_shape[-1] = num_blocks + new_shape.append(int(z_size / num_blocks)) + c = tf.to_int32(tf.reshape(c, shape=new_shape)) + c = bit_to_int(c, num_bits=int(z_size / num_blocks), base=2) + c_hot = tf.one_hot(c, depth=block_v_size, axis=-1) + c_hot_flat = tf.reshape(c_hot, shape=[-1, num_blocks, block_v_size]) + h1 = tf.matmul(tf.transpose(c_hot_flat, perm=[1, 0, 2]), means) + h1 = tf.transpose(h1, perm=[1, 0, 2]) + new_shape = shape_x + new_shape.append(hidden_size) + h1 = tf.reshape(h1, new_shape) + elif bottleneck_kind == 'rounding': + h1 = x + else: + raise ValueError('Unknown bottleneck kind.') + + h2 = tf.layers.dense(tf.nn.relu(h1), filter_size, name='vch2') + return tf.layers.dense(tf.nn.relu(h2), hidden_size, name='vcfin') + + +def vae(x, name, z_size): + """Simple variational autoencoder without discretization. + + Args: + x: Input to the discretization bottleneck. + name: Name for the bottleneck scope. + z_size: Number of bits used to produce discrete code; discrete codes range + from 1 to 2**z_size. + + Returns: + Embedding function, latent, loss, mu and log_simga. + """ + with tf.variable_scope(name): + mu = tf.layers.dense(x, z_size, name='mu') + log_sigma = tf.layers.dense(x, z_size, name='log_sigma') + shape = common_layers.shape_list(x) + epsilon = tf.random_normal([shape[0], shape[1], 1, z_size]) + z = mu + tf.exp(log_sigma / 2) * epsilon + kl = 0.5 * tf.reduce_mean( + tf.exp(log_sigma) + tf.square(mu) - 1. - log_sigma, axis=-1) + free_bits = z_size // 4 + kl_loss = tf.reduce_mean(tf.maximum(kl - free_bits, 0.0)) + return z, kl_loss, mu, log_sigma + + +def top_k_softmax(x, k): + """Calculate softmax(x), select top-k and rescale to sum to 1. + + Args: + x: Input to softmax over. + k: Number of top-k to select. + + Returns: + softmax(x) and maximum item. + """ + x = tf.nn.softmax(x) + top_x, _ = tf.nn.top_k(x, k=k + 1) + min_top = tf.reduce_min(top_x, axis=-1, keep_dims=True) + x = tf.nn.relu((x - min_top) + 1e-12) + x /= tf.reduce_sum(x, axis=-1, keep_dims=True) + return x, tf.reduce_max(top_x, axis=-1) + + +def gumbel_sample(shape): + """Sample from the Gumbel distribution, protect from overflows. + + Args: + shape: Shape of Gumbel samples. + + Returns: + Noise drawn from Gumbel distribution. 
+ """ + uniform_samples = tf.random_uniform(shape, minval=0.00001, maxval=0.99998) + return -tf.log(-tf.log(uniform_samples)) + + +def gumbel_softmax(x, + name, + z_size, + mode, + softmax_k=0, + kl_warmup_steps=150000, + summary=True): + """Gumbel softmax discretization bottleneck. + + Args: + x: Input to the discretization bottleneck. + name: Name for the bottleneck scope. + z_size: Number of bits used to produce discrete code; discrete codes range + from 1 to 2**z_size. + mode: Mode represents whether we are training or testing for bottlenecks + that differ in behavior (Default: None). + softmax_k: If > 1 then do top-k softmax (Default: 0). + kl_warmup_steps: Number of steps for kl warmup (Default: 150000). + summary: If True, then write summaries (Default: True). + + Returns: + Embedding function, discrete code and loss. + """ + with tf.variable_scope(name): + m = tf.layers.dense(x, 2**z_size, name='mask') + if softmax_k > 0: + m, kl = top_k_softmax(m, softmax_k) + return m, m, 1.0 - tf.reduce_mean(kl) + logsm = tf.nn.log_softmax(m) + + # Gumbel-softmax sample. + gumbel_samples = gumbel_sample(common_layers.shape_list(m)) + steps = kl_warmup_steps + gumbel_samples *= common_layers.inverse_exp_decay(steps // 5) * 0.5 + temperature = 1.2 - common_layers.inverse_lin_decay(steps) + + # 10% of the time keep reasonably high temperature to keep learning. + temperature = tf.cond( + tf.less(tf.random_uniform([]), 0.9), lambda: temperature, + lambda: tf.random_uniform([], minval=0.5, maxval=1.0)) + s = tf.nn.softmax((logsm + gumbel_samples) / temperature) + m = tf.nn.softmax(m) + kl = -tf.reduce_max(logsm, axis=-1) + + if summary: + tf.summary.histogram('max-log', tf.reshape(kl, [-1])) + + # Calculate the argmax and construct hot vectors. + maxvec = tf.reshape(tf.argmax(m, axis=-1), [-1]) + maxvhot = tf.stop_gradient(tf.one_hot(maxvec, 2**z_size)) + + # Add losses that prevent too few being used. + distrib = tf.reshape(logsm, [-1, 2**z_size]) * maxvhot + d_mean = tf.reduce_mean(distrib, axis=[0], keep_dims=True) + d_variance = tf.reduce_mean(tf.square(distrib - d_mean), axis=[0]) + d_dev = -tf.reduce_mean(d_variance) + ret = s + + if mode != tf.contrib.learn.ModeKeys.TRAIN: + ret = tf.reshape(maxvhot, common_layers.shape_list(s)) # Just hot @eval. + return m, ret, d_dev * 5.0 + tf.reduce_mean(kl) * 0.002 + + +def discrete_bottleneck(x, + hidden_size, + z_size, + filter_size, + name, + mode=None, + startup_steps=50000, + bottleneck_kind='dvq', + num_blocks=2, + reshape_method='slice', + projection_tensors=None, + means=None, + beta=0.25, + noise_dev=1., + decay=0.999, + discrete_mix=0.5, + random_top_k=1, + epsilon=1e-5, + softmax_k=0, + kl_warmup_steps=150000, + ema=True, + ema_count=None, + ema_means=None, + summary=True, + dp_strength=1.0, + dp_decay=1.0, + dp_alpha=0.5): + """Discretization bottleneck for latent variables. + + Args: + x: Input to the discretization bottleneck. + hidden_size: Dimension of the latent state. + z_size: Number of bits used to produce discrete code; discrete codes range + from 1 to 2**z_size. + filter_size: Filter size to be used for the embedding function. + name: Name for the bottleneck scope. + mode: Mode represents whether we are training or testing for bottlenecks + that differ in behavior (Default: None). + startup_steps: Number of steps after which latent predictor is trained + (Default: 50000). + bottleneck_kind: Kind of discretization bottleneck to use; one of dvq, + semhash, gumbel-softmax (Default: dvq). 
+    num_blocks: Number of blocks to use for decomposed vector quantization.
+    reshape_method: Method to reshape for DVQ (Default: slice).
+    projection_tensors: If the reshape method is project, then these are the
+      tensors used to project (Default: None).
+    means: The embedding table for dvq (Default: None).
+    beta: Beta factor for the DVQ loss (Default: 0.25).
+    noise_dev: Stddev for noise added for semhash (Default: 1.0).
+    decay: Decay factor for the exponential moving average (Default: 0.999).
+    discrete_mix: Factor for mixing discrete and non-discrete input for semhash
+      (Default: 0.5).
+    random_top_k: Noisy top-k for DVQ (Default: 1).
+    epsilon: Epsilon parameter for DVQ (Default: 1e-5).
+    softmax_k: If > 1 then do top-k softmax (Default: 0).
+    kl_warmup_steps: Number of steps for kl warmup (Default: 150000).
+    ema: If True update embeddings using exponential moving averages (Default:
+      True).
+    ema_count: Table of counts for each embedding corresponding to how many
+      examples in a batch it was the closest to (Default: None).
+    ema_means: Exponentially averaged version of the embeddings (Default: None).
+    summary: If True, then write summaries (Default: True).
+    dp_strength: Strength of Dirichlet Process loss prior (Default: 1.0).
+    dp_decay: Decay the dp_strength using an exponential decay using this
+      term (Default: 1.0).
+    dp_alpha: Alpha term (pseudo-count) in Dirichlet Process (Default: 0.5).
+
+  Returns:
+    Embedding to pass to the decoder, discrete latent, loss, and the embedding
+    function.
+
+  Raises:
+    ValueError: If projection_tensors is None for reshape_method project, or
+      ema_count or ema_means is None if we are using ema, or unknown args.
+  """
+  block_v_size = None
+  if bottleneck_kind == 'dvq':
+    # Define the dvq parameters
+    assert means is not None
+
+    # Check block dimensions add up
+    if hidden_size % num_blocks != 0:
+      raise ValueError('num_blocks does not divide hidden size')
+
+    if 2**z_size % num_blocks != 0:
+      raise ValueError('num_blocks does not divide embedding table size')
+
+    block_v_size = 2**(z_size / num_blocks)
+    block_v_size = int(block_v_size)
+
+    # Set the reshape method corresponding to projections or slices
+    if reshape_method == 'slice':
+      reshape_fn = partial(
+          slice_hidden, hidden_size=hidden_size, num_blocks=num_blocks)
+    elif reshape_method == 'project':
+      if projection_tensors is None:
+        raise ValueError(
+            'Projection tensors is None for reshape_method project')
+      reshape_fn = partial(
+          project_hidden,
+          projection_tensors=projection_tensors,
+          hidden_size=hidden_size,
+          num_blocks=num_blocks)
+    else:
+      raise ValueError('Unknown reshape_method')
+
+    # Check if the ema settings make sense
+    if ema:
+      if ema_count is None:
+        raise ValueError('ema_count is None but ema is True')
+      if ema_means is None:
+        raise ValueError('ema_means is None but ema is True')
+
+  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
+    l = tf.constant(0.0)
+    if bottleneck_kind == 'dense':
+      c = tf.layers.dense(x, z_size, name='vcc')
+      h1 = tf.layers.dense(c, filter_size, name='vch1')
+    elif bottleneck_kind == 'vae':
+      c, l, _, _ = vae(x, 'vae', z_size)
+      h1 = tf.layers.dense(c, filter_size, name='vch1')
+    elif bottleneck_kind == 'semhash':
+      c = tf.layers.dense(x, z_size, name='vcc')
+      y_clean = common_layers.saturating_sigmoid(c)
+      if summary:
+        tf.summary.histogram('y_clean', tf.reshape(y_clean, [-1]))
+      if noise_dev > 0 and mode == tf.estimator.ModeKeys.TRAIN:
+        noise = tf.truncated_normal(
+            common_layers.shape_list(c), mean=0.0, stddev=noise_dev)
+        y = common_layers.saturating_sigmoid(c + noise)
+      else:
+        y = y_clean
+      d = tf.to_float(tf.less(0.5, y))
+      y_discrete = tf.stop_gradient(d) + y - tf.stop_gradient(y)
+      pd = common_layers.inverse_exp_decay(startup_steps * 2)
+      pd *= discrete_mix
+      pd = pd if mode == tf.estimator.ModeKeys.TRAIN else 1.0
+      c = tf.where(
+          tf.less(tf.random_uniform([common_layers.shape_list(y)[0]]), pd),
+          y_discrete, y)
+      h1a = tf.layers.dense(c, filter_size, name='vch1a')
+      h1b = tf.layers.dense(1.0 - c, filter_size, name='vch1b')
+      h1 = h1a + h1b
+      dx = tf.to_int32(tf.stop_gradient(d))
+      c = bit_to_int(dx, z_size)
+    elif bottleneck_kind == 'gumbel-softmax':
+      _, hot, l = gumbel_softmax(x, name, z_size, mode, softmax_k,
+                                 kl_warmup_steps, summary)
+      c = tf.argmax(hot, axis=-1)
+      h1 = tf.layers.dense(hot, hidden_size, name='dae_dense')
+    elif bottleneck_kind == 'dvq':
+      x_reshaped = reshape_fn(x)
+      x_means_hot, x_means, q_loss, e_loss = embedding_lookup(
+          x_reshaped, means, num_blocks, block_v_size, random_top_k)
+
+      # Get the discrete latent representation
+      x_means_idx = tf.argmax(x_means_hot, axis=-1)
+
+      # Get the binary representation
+      x_means_bits = int_to_bit(
+          x_means_idx, num_bits=int(z_size / num_blocks), base=2)
+      shape = common_layers.shape_list(x_means_bits)
+      new_shape = shape[:-1]
+      new_shape[-1] = z_size
+      x_means_bits = tf.reshape(x_means_bits, shape=new_shape)
+      c = bit_to_int(tf.to_int32(x_means_bits), num_bits=z_size, base=2)
+
+      # Adjust shape of c
+      shape_x = common_layers.shape_list(x)
+      new_shape = shape_x[:-1]
+      c = tf.reshape(c, new_shape)
+
+      # Update the ema variables
+      if ema:
+        tf.logging.info('Using EMA with beta = {}'.format(beta))
+        updated_ema_count = moving_averages.assign_moving_average(
+            ema_count,
+            tf.reduce_sum(
+                tf.reshape(x_means_hot, shape=[-1, num_blocks, block_v_size]),
+                axis=0),
+            decay,
+            zero_debias=False)
+
+        # Adding a term that puts a Dirichlet prior over cluster probabilities
+        # Hopefully it'll encourage rich-get-richer behavior
+        dp_prior_loss = 0.
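Both the semhash and DVQ paths above round-trip between integer codes and lower-endian bit vectors through `int_to_bit` and `bit_to_int`. A minimal numpy sketch of that round trip (numpy stands in for the tensor ops purely for illustration; it is not part of the patch):

```python
import numpy as np


def int_to_bit_np(x_int, num_bits, base=2):
  # Lower-endian: bit i is (x // base**i) % base, so 13 -> [1, 0, 1, 1].
  return np.stack([(x_int // base**i) % base for i in range(num_bits)],
                  axis=-1).astype(np.float32)


def bit_to_int_np(x_bit, num_bits, base=2):
  # Inverse of the above: sum_i bit_i * base**i.
  return sum(x_bit[..., i].astype(np.int32) * base**i for i in range(num_bits))


codes = np.array([0, 7, 13])
bits = int_to_bit_np(codes, num_bits=4)
assert np.all(bit_to_int_np(bits, num_bits=4) == codes)
```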
+ if dp_strength > 0.0: + # Decay dp_strength over time to make it less important + dp_strength = tf.train.exponential_decay( + dp_strength, + global_step=tf.to_int32(tf.train.get_global_step()), + decay_steps=20000, + decay_rate=dp_decay) + dp_count = ema_count + dp_alpha + p = dp_count / tf.reduce_sum(dp_count, 1, keepdims=True) + dp_prior_loss = tf.log(p) + dp_prior_loss = -1.0 * tf.reduce_sum(dp_prior_loss) + dp_prior_loss /= (num_blocks * block_v_size) + + x_means_hot_flat = tf.reshape( + x_means_hot, shape=[-1, num_blocks, block_v_size]) + dw = tf.matmul( + tf.transpose(x_means_hot_flat, perm=[1, 2, 0]), + tf.transpose(x_reshaped, perm=[1, 0, 2])) + updated_ema_means = moving_averages.assign_moving_average( + ema_means, dw, decay, zero_debias=False) + n = tf.reduce_sum(updated_ema_count, axis=-1, keep_dims=True) + updated_ema_count = ((updated_ema_count + epsilon) / + (n + 2**z_size * epsilon) * n) + updated_ema_means /= tf.expand_dims(updated_ema_count, axis=-1) + + with tf.control_dependencies([e_loss]): + update_means = tf.assign(means, updated_ema_means) + with tf.control_dependencies([update_means]): + l = beta * e_loss + dp_strength * dp_prior_loss + else: + l = q_loss + beta * e_loss + + x_means = tf.reshape(x_means, shape_x) + x_reshaped = tf.reshape(x_reshaped, shape_x) + h1 = x_reshaped + tf.stop_gradient(x_means - x_reshaped) + else: + raise ValueError('Unknown discretization method.') + + h2 = tf.layers.dense(tf.nn.relu(h1), filter_size, name='vch2') + res = tf.layers.dense(tf.nn.relu(h2), hidden_size, name='vcfin') + + embed_fn = partial( + embed, + hidden_size=hidden_size, + z_size=z_size, + filter_size=filter_size, + name=name, + bottleneck_kind=bottleneck_kind, + num_blocks=num_blocks, + block_v_size=block_v_size, + means=means) + return res, c, l, embed_fn diff --git a/tensor2tensor/layers/discretization_test.py b/tensor2tensor/layers/discretization_test.py new file mode 100644 index 000000000..74eb3d6fb --- /dev/null +++ b/tensor2tensor/layers/discretization_test.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
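For orientation, the sketch below shows roughly how `discrete_bottleneck` is meant to be called for the `dvq` kind, mirroring the signature above. The shapes, sizes, and variable names are illustrative assumptions rather than part of the patch, and this is a rough sketch, not a tested training setup:

```python
import tensorflow as tf
from tensor2tensor.layers import discretization

hidden_size, z_size, num_blocks = 64, 8, 2
block_v_size = 2**(z_size // num_blocks)      # embedding entries per block

tf.train.get_or_create_global_step()          # read by the EMA/DP branch
means = tf.get_variable(
    "means", [num_blocks, block_v_size, hidden_size // num_blocks],
    initializer=tf.uniform_unit_scaling_initializer())
ema_count = tf.get_variable(
    "ema_count", [num_blocks, block_v_size],
    initializer=tf.constant_initializer(0))
ema_means = tf.get_variable("ema_means", initializer=means.initialized_value())

x = tf.random_normal([16, 1, 1, hidden_size])  # e.g. compressed targets
res, c, loss, embed_fn = discretization.discrete_bottleneck(
    x=x, hidden_size=hidden_size, z_size=z_size, filter_size=128, name="vc",
    mode=tf.estimator.ModeKeys.TRAIN, bottleneck_kind="dvq",
    num_blocks=num_blocks, means=means, ema_count=ema_count,
    ema_means=ema_means)
decoded = embed_fn(c)  # maps the discrete code back to a continuous embedding
```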
+ +"""Tests for tensor2tensor.layers.discretization.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +# Dependency imports +import numpy as np +from tensor2tensor.layers import discretization +import tensorflow as tf + + +class DiscretizationTest(tf.test.TestCase): + + def setUp(self): + tf.set_random_seed(1234) + np.random.seed(123) + + def testBitToIntZeros(self): + x_bit = tf.zeros(shape=[1, 10], dtype=tf.float32) + x_int = tf.zeros(shape=[1], dtype=tf.int32) + diff = discretization.bit_to_int(x_bit, num_bits=10) - x_int + with self.test_session() as sess: + tf.global_variables_initializer().run() + d = sess.run(diff) + self.assertEqual(d, 0) + + def testBitToIntOnes(self): + x_bit = tf.ones(shape=[1, 3], dtype=tf.float32) + x_int = 7 * tf.ones(shape=[1], dtype=tf.int32) + diff = discretization.bit_to_int(x_bit, num_bits=3) - x_int + with self.test_session() as sess: + tf.global_variables_initializer().run() + d = sess.run(diff) + self.assertEqual(d, 0) + + def testIntToBitZeros(self): + x_bit = tf.zeros(shape=[1, 10], dtype=tf.float32) + x_int = tf.zeros(shape=[1], dtype=tf.int32) + diff = discretization.int_to_bit(x_int, num_bits=10) - x_bit + with self.test_session() as sess: + tf.global_variables_initializer().run() + d = sess.run(diff) + self.assertTrue(np.all(d == 0)) + + def testIntToBitOnes(self): + x_bit = tf.ones(shape=[1, 3], dtype=tf.float32) + x_int = 7 * tf.ones(shape=[1], dtype=tf.int32) + diff = discretization.int_to_bit(x_int, num_bits=3) - x_bit + with self.test_session() as sess: + tf.global_variables_initializer().run() + d = sess.run(diff) + self.assertTrue(np.all(d == 0)) + + def testProjectHidden(self): + hidden_size = 60 + block_dim = 20 + num_blocks = 3 + x = tf.zeros(shape=[1, hidden_size], dtype=tf.float32) + projection_tensors = tf.random_normal( + shape=[num_blocks, hidden_size, block_dim], dtype=tf.float32) + x_projected = discretization.project_hidden(x, projection_tensors, + hidden_size, num_blocks) + with self.test_session() as sess: + tf.global_variables_initializer().run() + x_projected_eval = sess.run(x_projected) + self.assertEqual(np.shape(x_projected_eval), (1, num_blocks, block_dim)) + self.assertTrue(np.all(x_projected_eval == 0)) + + def testSliceHiddenZeros(self): + hidden_size = 60 + block_dim = 20 + num_blocks = 3 + x = tf.zeros(shape=[1, hidden_size], dtype=tf.float32) + x_sliced = discretization.slice_hidden(x, hidden_size, num_blocks) + with self.test_session() as sess: + tf.global_variables_initializer().run() + x_sliced_eval = sess.run(x_sliced) + self.assertEqual(np.shape(x_sliced_eval), (1, num_blocks, block_dim)) + self.assertTrue(np.all(x_sliced_eval == 0)) + + def testSliceHiddenOnes(self): + hidden_size = 60 + block_dim = 20 + num_blocks = 3 + x = tf.ones(shape=[1, hidden_size], dtype=tf.float32) + x_sliced = discretization.slice_hidden(x, hidden_size, num_blocks) + with self.test_session() as sess: + tf.global_variables_initializer().run() + x_sliced_eval = sess.run(x_sliced) + self.assertEqual(np.shape(x_sliced_eval), (1, num_blocks, block_dim)) + self.assertTrue(np.all(x_sliced_eval == 1)) + + def testNearestNeighbors(self): + x = tf.constant([[0, 0.9, 0], [0.8, 0., 0.]], dtype=tf.float32) + x = tf.expand_dims(x, axis=0) + means = tf.constant( + [[1, 0, 0], [0, 1, 0], [0, 0, 1], [9, 9, 9]], dtype=tf.float32) + means = tf.stack([means, means], axis=0) + x_means_hot = discretization.nearest_neighbor(x, means, block_v_size=4) + x_means_hot_test = np.array([[0, 1, 
0, 0], [1, 0, 0, 0]]) + x_means_hot_test = np.expand_dims(x_means_hot_test, axis=0) + with self.test_session() as sess: + tf.global_variables_initializer().run() + x_means_hot_eval = sess.run(x_means_hot) + self.assertEqual(np.shape(x_means_hot_eval), (1, 2, 4)) + self.assertTrue(np.all(x_means_hot_eval == x_means_hot_test)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 3c54fa339..478e3284f 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -542,3 +542,8 @@ def top(self, body_output, _): def targets_bottom(self, x): """SymbolModality overrides targets_bottom, so need to override here too.""" return self.bottom(x) + + @property + def top_is_pointwise(self): + # pointwise mode manipulates body output, not logits, so it fails here. + return False diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py index df7744ff9..c78d1f52a 100644 --- a/tensor2tensor/models/__init__.py +++ b/tensor2tensor/models/__init__.py @@ -37,6 +37,7 @@ from tensor2tensor.models import vanilla_gan from tensor2tensor.models import xception +from tensor2tensor.models.research import adafactor_experiments from tensor2tensor.models.research import aligned from tensor2tensor.models.research import attention_lm from tensor2tensor.models.research import attention_lm_moe @@ -47,5 +48,6 @@ from tensor2tensor.models.research import transformer_moe from tensor2tensor.models.research import transformer_revnet from tensor2tensor.models.research import transformer_sketch +from tensor2tensor.models.research import transformer_symshard from tensor2tensor.models.research import transformer_vae # pylint: enable=unused-import diff --git a/tensor2tensor/models/image_transformer_2d.py b/tensor2tensor/models/image_transformer_2d.py index 5ab0d112b..046fa06ee 100644 --- a/tensor2tensor/models/image_transformer_2d.py +++ b/tensor2tensor/models/image_transformer_2d.py @@ -424,6 +424,35 @@ def imagetransformer2d_tiny(): return hparams +def update_hparams_for_tpu(hparams): + hparams.use_pad_remover = False # where op not supported + hparams.optimizer = "TrueAdam" + hparams.batch_size = 4 + + +@registry.register_hparams +def img2mg_transformer_base_tpu(): + """Hparams for training img2img_transformer on tpu.""" + hparams = img2img_transformer_base() + update_hparams_for_tpu(hparams) + hparams.batch_size = 4 + hparams.num_heads = 4 # heads are expensive on tpu + hparams.num_decoder_layers = 8 + hparams.num_encoder_layers = 4 + hparams.shared_embedding_and_softmax_weights = False + return hparams + + +@registry.register_hparams +def img2mg_transformer_tiny_tpu(): + hparams = img2mg_transformer_base_tpu() + hparams.num_hidden_layers = 2 + hparams.hidden_size = 16 + hparams.batch_size = 2 + hparams.num_heads = 2 + return hparams + + @registry.register_hparams def img2img_transformer2d_n3(): hparams = img2img_transformer2d_base() diff --git a/tensor2tensor/models/research/adafactor_experiments.py b/tensor2tensor/models/research/adafactor_experiments.py new file mode 100644 index 000000000..d7031dee2 --- /dev/null +++ b/tensor2tensor/models/research/adafactor_experiments.py @@ -0,0 +1,173 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experiments with Adafactor. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensor2tensor.models import transformer +from tensor2tensor.utils import registry + + +def mimic_adam_with_adafactor(hparams): + """Switch from Adam to Adafactor, approximating the behavior of Adam. + + Some minor things may be different, like epsilon and beta1 correction. + + Args: + hparams: model hyperparameters where "Adam" in hparams.optimizer + """ + assert "Adam" in hparams.optimizer + hparams.optimizer = "Adafactor" + hparams.optimizer_adafactor_beta1 = hparams.optimizer_adam_beta1 + hparams.optimizer_adafactor_beta2 = hparams.optimizer_adam_beta2 + hparams.optimizer_adafactor_multiply_by_parameter_scale = False + hparams.optimizer_adafactor_factored = False + hparams.optimizer_adafactor_clipping_threshold = None + hparams.optimizer_adafactor_decay_type = "Adam" + + +@registry.register_hparams +def afx_adam(): + """Old version - Adam.""" + hparams = transformer.transformer_base_v2() + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.999 + hparams.symbol_modality_num_shards = 1 + hparams.batch_size = 2048 + hparams.optimizer = "Adam" + hparams.learning_rate_schedule = ( + "constant*rsqrt_decay*linear_warmup*rsqrt_hidden_size") + hparams.learning_rate_constant = 2.0 + return hparams + + +@registry.register_hparams +def afx_mimic_adam(): + """Emulating Adam - should be very similar to afx_adam.""" + hparams = afx_adam() + mimic_adam_with_adafactor(hparams) + return hparams + + +@registry.register_hparams +def afx_base(): + """Baseline - no momentum, beta=0.999.""" + hparams = afx_mimic_adam() + hparams.optimizer_adafactor_beta1 = 0.0 + return hparams + + +@registry.register_hparams +def afx_factored(): + hparams = afx_base() + hparams.optimizer_adafactor_factored = True + return hparams + + +@registry.register_hparams +def afx_fast(): + hparams = afx_base() + hparams.optimizer_adafactor_beta2 = 0.9 + return hparams + + +@registry.register_hparams +def afx_clip(): + hparams = afx_base() + hparams.optimizer_adafactor_clipping_threshold = 1.0 + return hparams + + +@registry.register_hparams +def afx_clip2(): + hparams = afx_base() + hparams.optimizer_adafactor_clipping_threshold = 2.0 + return hparams + + +@registry.register_hparams +def afx_clip_factored(): + hparams = afx_clip() + hparams.optimizer_adafactor_factored = True + return hparams + + +@registry.register_hparams +def afx_pow05(): + hparams = afx_base() + hparams.optimizer_adafactor_decay_type = "pow" + hparams.optimizer_adafactor_memory_exponent = 0.5 + return hparams + + +@registry.register_hparams +def afx_pow08(): + hparams = afx_pow05() + hparams.optimizer_adafactor_memory_exponent = 0.8 + return hparams + + +@registry.register_hparams +def afx_pow10(): + hparams = afx_pow05() + hparams.optimizer_adafactor_memory_exponent = 1.0 + return hparams + + +@registry.register_hparams +def afx_pow08_clip(): + hparams = afx_pow08() + hparams.optimizer_adafactor_clipping_threshold = 1.0 + return hparams + + +@registry.register_hparams +def 
afx_relative(): + hparams = afx_base() + hparams.optimizer_adafactor_multiply_by_parameter_scale = True + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 + return hparams + + +@registry.register_hparams +def afx_unscale(): + hparams = afx_base() + hparams.shared_embedding_and_softmax_weights = False + hparams.multiply_embedding_mode = "none" + return hparams + + +@registry.register_hparams +def afx_unscale_relative(): + hparams = afx_unscale() + hparams.optimizer_adafactor_multiply_by_parameter_scale = True + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 + return hparams + + +@registry.register_hparams +def afx_adafactor(): + """Adafactor with recommended learning rate schedule.""" + hparams = afx_adam() + hparams.optimizer = "Adafactor" + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 + return hparams diff --git a/tensor2tensor/models/research/transformer_symshard.py b/tensor2tensor/models/research/transformer_symshard.py new file mode 100644 index 000000000..64b9fed97 --- /dev/null +++ b/tensor2tensor/models/research/transformer_symshard.py @@ -0,0 +1,416 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test of the SymShard programming model. + +Symmetric model parallellism. + +Each shard (device) has a similar structure with different weights. +Occasional allreduce (sum) across shards. + +On TPU, we replicate the whole model on each core. This is not the intended +use, but we can test the model quality. + +Example problem: translate_ende_8k_packed + +Preliminary results on languagemodel_lm1b8k_packed (200k steps 8 cores) + transformer_tpu: 48M params dev-log-ppl=-1.29 dev-BLEU=27.0 + transformer_symshard_sh4: 49M params dev-log-ppl=-1.30 dev-BLEU=26.4 + transformer_symshard_base: 98M params dev-log-ppl=-1.23 dev-BLEU=27.6 + + transformer_symshard_base with different mixing fraction (default=0.5): + mix_fraction=0.0 dev-log-ppl=-1.33 + mix_fraction=0.25 dev-log-ppl=-1.23 + mix_fraction=0.5 dev-log-ppl=-1.23 + mix_fraction=0.75 dev-log-ppl=-1.24 + mix_fraction=1.0 dev-log-ppl=-1.28 + +TODO(noam): Make sure no one is using super_lm, then delete it. 
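All of the `afx_*` sets above are small deltas on one shared baseline, which keeps optimizer comparisons cheap to express. A quick sketch of how one of them composes, assuming the module imports as written below; the asserts only restate what the definitions above imply:

```python
from tensor2tensor.models.research import adafactor_experiments

hparams = adafactor_experiments.afx_clip()
# afx_clip -> afx_base -> afx_mimic_adam -> afx_adam, then the overrides above.
assert hparams.optimizer == "Adafactor"
assert hparams.optimizer_adafactor_beta1 == 0.0
assert hparams.optimizer_adafactor_clipping_threshold == 1.0
```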
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers +from tensor2tensor.utils import expert_utils +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class TransformerSymshard(t2t_model.T2TModel): + """See file docstring.""" + + def body(self, features): + hparams = self._hparams + ps_devices = self._ps_devices + single_device = (len(ps_devices) == 1) + assert hparams.num_model_shards % len(ps_devices) == 0 + shards_per_device = hparams.num_model_shards // len(ps_devices) + model_devices = [ps_devices[i // shards_per_device] + for i in xrange(hparams.num_model_shards)] + print("model_devices = %s" % model_devices) + mp = expert_utils.Parallelism(model_devices, reuse=False) + targets_vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size + # squeeze out channels, heights + targets = tf.squeeze(features["targets_raw"], [2, 3]) + targets_embedding_var = mp( + tf.get_variable, "embedding", + [[targets_vocab_size, hparams.hidden_size]] * mp.n, + initializer=tf.random_normal_initializer( + 0.0, hparams.hidden_size**-0.5)) + shifted_targets = common_layers.shift_right_2d(targets) + # Bypass the symbol modality and use a different embedding on each shard. + if single_device: + targets_embedding_var_combined = tf.concat(targets_embedding_var, 1) + decoder_input_combined = common_layers.embedding( + shifted_targets, targets_vocab_size, + hparams.hidden_size * mp.n, + multiplier=hparams.hidden_size**0.5, + embedding_var=targets_embedding_var_combined, + ) + decoder_input = tf.split(decoder_input_combined, mp.n, axis=2) + else: + targets_embedding_var_combined = None + decoder_input = mp( + common_layers.embedding, shifted_targets, targets_vocab_size, + hparams.hidden_size, + multiplier=hparams.hidden_size**0.5, + embedding_var=targets_embedding_var, + ) + decoder_self_attention_bias = mp( + common_attention.attention_bias_lower_triangle, + tf.shape(targets)[1]) + if "targets_segmentation" in features: + # "Packed" dataset - keep the examples from seeing each other. 
+ targets_segmentation = features["targets_segmentation"] + targets_position = features["targets_position"] + decoder_self_attention_bias = mp( + tf.add, decoder_self_attention_bias, + mp(common_attention.attention_bias_same_segment, + targets_segmentation, targets_segmentation)) + decoder_input = mp( + common_attention.add_timing_signal_1d_given_position, + decoder_input, targets_position) + else: + targets_position = None + decoder_self_attention_bias = mp( + common_attention.attention_bias_lower_triangle, + tf.shape(targets)[1]) + decoder_input = mp(common_attention.add_timing_signal_1d, decoder_input) + + if self.has_input: + inputs = tf.squeeze(features["inputs_raw"], [2, 3]) + inputs_vocab_size = self._problem_hparams.vocabulary["inputs"].vocab_size + # share everything for now + share_inputs_and_targets_embedding = True + if share_inputs_and_targets_embedding: + assert inputs_vocab_size == targets_vocab_size + inputs_embedding_var = targets_embedding_var + inputs_embedding_var_combined = targets_embedding_var_combined + if single_device: + encoder_input_combined = common_layers.embedding( + inputs, inputs_vocab_size, + hparams.hidden_size * mp.n, + multiplier=hparams.hidden_size**0.5, + embedding_var=inputs_embedding_var_combined, + ) + encoder_input = tf.split(encoder_input_combined, mp.n, axis=2) + else: + encoder_input = mp( + common_layers.embedding, inputs, inputs_vocab_size, + hparams.hidden_size, + multiplier=hparams.hidden_size**0.5, + embedding_var=inputs_embedding_var, + ) + if "inputs_segmentation" in features: + # "Packed" dataset - keep the examples from seeing each other. + inputs_segmentation = features["inputs_segmentation"] + inputs_position = features["inputs_position"] + encoder_self_attention_bias = mp( + common_attention.attention_bias_same_segment, + inputs_segmentation, inputs_segmentation) + encoder_decoder_attention_bias = mp( + common_attention.attention_bias_same_segment, + targets_segmentation, inputs_segmentation) + encoder_input = mp( + common_attention.add_timing_signal_1d_given_position, + encoder_input, inputs_position) + else: + encoder_padding = tf.to_float(tf.equal(inputs, 0)) + ignore_padding = common_attention.attention_bias_ignore_padding( + encoder_padding) + encoder_self_attention_bias = ignore_padding + encoder_decoder_attention_bias = ignore_padding + inputs_position = None + encoder_input = mp(common_attention.add_timing_signal_1d, encoder_input) + + # encoder stack here + with tf.variable_scope("encoder"): + encoder_input = mp( + tf.nn.dropout, encoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + encoder_output = _layer_stack( + mp, + encoder_input, + encoder_self_attention_bias, + hparams.encoder_layers, + hparams) + else: + encoder_decoder_attention_bias = None + encoder_output = None + + with tf.variable_scope("decoder"): + decoder_input = mp( + tf.nn.dropout, decoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + decoder_output = _layer_stack( + mp, + decoder_input, + decoder_self_attention_bias, + layers=hparams.decoder_layers, + hparams=hparams, + encoder_output=encoder_output, + encoder_decoder_attention_bias=encoder_decoder_attention_bias) + + # Bypass the symbol modality and compute logits directly. + # We compute a different set of logits on each shard, and sum them. + # Share the weights with the target embedding. 
+ output_var = targets_embedding_var + output_var_combined = targets_embedding_var_combined + if single_device: + decoder_output = tf.concat(decoder_output, 2) + logits = tf.tensordot(decoder_output, output_var_combined, [[2], [1]]) + num, denom = common_layers.padded_cross_entropy( + logits, targets, hparams.label_smoothing) + training_loss = num / denom + else: + logits = mp( + tf.tensordot, decoder_output, output_var, [[[2], [1]]] * mp.n) + logits = common_layers.all_reduce_ring(logits, mp) + # On each device, we compute the loss for a part of the batch. + # This is faster than computing the whole loss on one shard. + mp, logits = common_layers.reduce_by_device(mp, logits, lambda l: l[0]) + def _loss_for_shard(logits, targets, shard): + logits = common_layers.approximate_split(logits, mp.n, 0)[shard] + targets = common_layers.approximate_split(targets, mp.n, 0)[shard] + return common_layers.padded_cross_entropy( + logits, targets, hparams.label_smoothing) + num, denom = mp(_loss_for_shard, logits, targets, range(mp.n)) + training_loss = tf.add_n(num) / tf.add_n(denom) + logits = logits[0] + logits = tf.expand_dims(tf.expand_dims(logits, 2), 3) + # override training loss so that it is not computed externally. + losses = {"training": training_loss} + return logits, losses + + +def _layer_stack(mp, + inputs, + self_attention_bias, + layers, + hparams, + encoder_output=None, + encoder_decoder_attention_bias=None): + """A stack of layers. + + Args: + mp: a Parallelism object + inputs: a list of Tensors + self_attention_bias: list of bias Tensor for self-attention + (see common_attention.attention_bias()) + layers: a string + hparams: hyperparameters for model + encoder_output: optional list of tensors + encoder_decoder_attention_bias: optional list of tensors + + Returns: + y: a list of Tensors + """ + layers = layers.strip(",").split(",") + + # scaled_dot_product_attention_with_projections uses a 3d attention bias + # (no heads), where multihead_attention uses 4d attention bias. 
+ self_attention_bias_3d = mp(tf.squeeze, self_attention_bias, 1) + if encoder_decoder_attention_bias is not None: + encoder_decoder_attention_bias_3d = mp( + tf.squeeze, encoder_decoder_attention_bias, 1) + relu_dropout_broadcast_dims = ( + common_layers.comma_separated_string_to_integer_list( + getattr(hparams, "relu_dropout_broadcast_dims", ""))) + mix_size = int(hparams.mix_fraction * hparams.hidden_size) + accumulator = inputs + x = inputs + for layer_num, layer_type in enumerate(layers): + with tf.variable_scope("%s_%d" % (layer_type, layer_num)): + tf.logging.info("%s_%d" % (layer_type, layer_num)) + if layer_type == "a": + # accumulate + accumulator = mp(tf.add, x, accumulator) + x = accumulator + elif layer_type == "n": + # normalize + x = mp(common_layers.apply_norm, + x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) + elif layer_type == "d": + # dropout + x = mp(tf.nn.dropout, x, 1.0 - hparams.layer_prepostprocess_dropout) + elif layer_type == "m": + if mix_size > 0: + # mix across shards + def _split(t): + return tuple(tf.split( + t, [mix_size, hparams.hidden_size - mix_size], 2)) + to_mix, to_keep = mp(_split, x) + mixed = common_layers.all_reduce_ring(to_mix, mp) + mixed = mp(tf.multiply, mixed, mp.n ** -0.5) + x = mp(lambda a, b: tf.concat([a, b], 2), mixed, to_keep) + elif layer_type == "att": + # single-head attention + q = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, + name="q_transform") + x = mp( + common_attention.scaled_dot_product_attention_simple, + q, x, x, self_attention_bias_3d) + x = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, + name="o_transform") + elif layer_type == "enc-att": + # single-head attention over encoder + q = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, + name="q_transform") + assert encoder_output is not None + x = mp( + common_attention.scaled_dot_product_attention_simple, + q, encoder_output, encoder_output, + encoder_decoder_attention_bias_3d) + x = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, + name="o_transform") + elif layer_type == "multihead-att": + # multi-head attention + x = mp( + common_attention.multihead_attention, + x, + None, + self_attention_bias, # bias + hparams.multihead_attention_key_channels or hparams.hidden_size, + hparams.multihead_attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.multihead_attention_num_heads, + hparams.attention_dropout) + elif layer_type == "enc-multihead-att": + # multi-head attention + x = mp( + common_attention.multihead_attention, + x, + encoder_output, + encoder_decoder_attention_bias, # bias + hparams.multihead_attention_key_channels or hparams.hidden_size, + hparams.multihead_attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.multihead_attention_num_heads, + hparams.attention_dropout) + elif layer_type == "ffn": + x = mp( + common_layers.dense_relu_dense, x, + hparams.filter_size, hparams.hidden_size, + dropout=hparams.relu_dropout, + dropout_broadcast_dims=[relu_dropout_broadcast_dims] * mp.n) + else: + assert False, "unknown sublayer %s" % layer_type + return x + + +@registry.register_hparams +def transformer_symshard_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.hidden_size = 256 + hparams.batch_size = 2048 + hparams.max_length = 0 + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. 
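The `layers` argument of `_layer_stack` is a comma-separated program over the sublayer codes handled above (`a` accumulate, `n` normalize, `d` dropout, `m` mix across shards, `att` and `enc-att` single-head attention, `multihead-att` and `enc-multihead-att`, `ffn`). A small sketch of how such a spec expands, mirroring the `strip(",").split(",")` parsing in `_layer_stack`:

```python
# Two encoder blocks followed by a final normalize-and-dropout, as a spec string.
encoder_spec = ("n,att,m,d,a," "n,ffn,m,d,a,") * 2 + "n,d"
sublayers = encoder_spec.strip(",").split(",")
# ['n', 'att', 'm', 'd', 'a', 'n', 'ffn', 'm', 'd', 'a',
#  'n', 'att', 'm', 'd', 'a', 'n', 'ffn', 'm', 'd', 'a', 'n', 'd']
assert len(sublayers) == 22
```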
+ hparams.layer_prepostprocess_dropout = 0.2 + hparams.add_hparam("attention_dropout", 0.1) + hparams.add_hparam("relu_dropout", 0.0) + hparams.add_hparam("relu_dropout_broadcast_dims", "1") + hparams.layer_prepostprocess_dropout = 0.1 + hparams.layer_prepostprocess_dropout_broadcast_dims = "1" # length + hparams.label_smoothing = 0.1 + hparams.clip_grad_norm = 0. # i.e. no gradient clipping + hparams.optimizer = "Adafactor" + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 + hparams.initializer_gain = 1.0 + hparams.initializer = "uniform_unit_scaling" + hparams.weight_decay = 0.0 + # TODO(noam): use this to control sharing. We now share always + hparams.shared_embedding_and_softmax_weights = True + # we only want one data shard. + hparams.no_data_parallelism = True + # bypass the symbol modality so that we can use model parallelism. + hparams.target_modality = "symbol:identity" + hparams.input_modalities = "inputs:symbol:identity" + hparams.add_hparam("filter_size", 1280) + hparams.add_hparam("mix_fraction", 0.5) + # attention-related flags + hparams.add_hparam("multihead_attention_num_heads", 4) + hparams.add_hparam("multihead_attention_key_channels", 0) + hparams.add_hparam("multihead_attention_value_channels", 0) + hparams.add_hparam("pos", "timing") # timing, none + hparams.add_hparam( + "encoder_layers", ("n,att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d") + hparams.add_hparam( + "decoder_layers", + ("n,att,m,d,a," "n,enc-att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d") + # Number of model shards - each one has separate parameters. + # Changing this number invalidates checkpoints. + hparams.add_hparam("num_model_shards", 8) + return hparams + + +@registry.register_hparams +def transformer_symshard_sh4(): + """4 shards instead of 8. 
Similar model size to transformer_tpu().""" + hparams = transformer_symshard_base() + hparams.num_model_shards = 4 + return hparams + + +@registry.register_hparams +def transformer_symshard_lm_0(): + """For language modeling - suggested problem languagemodel_lm1b8k_packed.""" + hparams = transformer_symshard_base() + hparams.label_smoothing = 0 + return hparams + + +@registry.register_hparams +def transformer_symshard_h4(): + """4 heads per shard.""" + hparams = transformer_symshard_base() + hparams.encoder_layers = ("n,multihead-att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d" + hparams.decoder_layers = ( + ("n,multihead-att,m,d,a," "n,enc-multihead-att,m,d,a," "n,ffn,m,d,a,") * 6 + + "n,d") + return hparams diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index 6ad4e19a5..4b37528ea 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -18,25 +18,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - +from functools import partial import math - # Dependency imports - -from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_image_attention as cia from tensor2tensor.layers import common_layers +from tensor2tensor.layers import discretization from tensor2tensor.models import transformer from tensor2tensor.utils import beam_search from tensor2tensor.utils import expert_utils from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model - - import tensorflow as tf -from tensorflow.python.training import moving_averages - _DO_SUMMARIES = True @@ -110,325 +104,6 @@ def top_k_experts(x, k, hparams): return gates, load_loss -def gumbel_sample(shape): - """Sample from the Gumbel distribution, protect from overflows.""" - uniform_samples = tf.random_uniform(shape, minval=0.00001, maxval=0.99998) - return -tf.log(-tf.log(uniform_samples)) - - -def dae(x, hparams, name): - with tf.variable_scope(name): - m = tf.layers.dense(x, hparams.v_size, name="mask") - if hparams.softmax_k > 0: - m, kl = top_k_softmax(m, hparams.softmax_k) - return m, m, 1.0 - tf.reduce_mean(kl) - logsm = tf.nn.log_softmax(m) - # Gumbel-softmax sample. - gumbel_samples = gumbel_sample(common_layers.shape_list(m)) - steps = hparams.kl_warmup_steps - gumbel_samples *= common_layers.inverse_exp_decay(steps // 5) * 0.5 - temperature = 1.2 - common_layers.inverse_lin_decay(steps) - # 10% of the time keep reasonably high temperature to keep learning. - temperature = tf.cond(tf.less(tf.random_uniform([]), 0.9), - lambda: temperature, - lambda: tf.random_uniform([], minval=0.5, maxval=1.0)) - s = tf.nn.softmax((logsm + gumbel_samples) / temperature) - m = tf.nn.softmax(m) - kl = - tf.reduce_max(logsm, axis=-1) - if _DO_SUMMARIES: - tf.summary.histogram("max-log", tf.reshape(kl, [-1])) - # Calculate the argmax and construct hot vectors. - maxvec = tf.reshape(tf.argmax(m, axis=-1), [-1]) - maxvhot = tf.stop_gradient(tf.one_hot(maxvec, hparams.v_size)) - # Add losses that prevent too few being used. 
- distrib = tf.reshape(logsm, [-1, hparams.v_size]) * maxvhot - d_mean = tf.reduce_mean(distrib, axis=[0], keep_dims=True) - d_variance = tf.reduce_mean(tf.square(distrib - d_mean), axis=[0]) - d_dev = - tf.reduce_mean(d_variance) - ret = s - if hparams.mode != tf.contrib.learn.ModeKeys.TRAIN: - ret = tf.reshape(maxvhot, common_layers.shape_list(s)) # Just hot @eval. - return m, ret, d_dev * 5.0 + tf.reduce_mean(kl) * 0.002 - - -def vae(x, z_size, name): - with tf.variable_scope(name): - mu = tf.layers.dense(x, z_size, name="mu") - log_sigma = tf.layers.dense(x, z_size, name="log_sigma") - shape = common_layers.shape_list(x) - epsilon = tf.random_normal([shape[0], shape[1], 1, z_size]) - z = mu + tf.exp(log_sigma / 2) * epsilon - kl = 0.5 * tf.reduce_mean( - tf.exp(log_sigma) + tf.square(mu) - 1. - log_sigma, axis=-1) - free_bits = z_size // 4 - kl_loss = tf.reduce_mean(tf.maximum(kl - free_bits, 0.0)) - return z, kl_loss, mu, log_sigma - - -def project_hidden(x, hparams): - """Project encoder hidden state into block_dim using projection tensors. - - Args: - x: Encoder hidden state of shape [-1, hidden_size] - hparams: Hparams - - Returns: - Projected states of shape [-1, num_blocks, block_dim]. - """ - x = tf.reshape(x, shape=[1, -1, hparams.hidden_size]) - x_tiled = tf.reshape( - tf.tile(x, multiples=[hparams.num_blocks, 1, 1]), - shape=[hparams.num_blocks, -1, hparams.hidden_size]) - x_projected = tf.matmul(x_tiled, hparams.projection_tensors) - x_projected = tf.transpose(x_projected, perm=[1, 0, 2]) - return x_projected - - -def slice_hidden(x, hparams): - """Slice encoder hidden state into block_dim. - - Args: - x: Encoder hidden state of shape [-1, hidden_size] - hparams: Hparams - - Returns: - Sliced states of shape [-1, num_blocks, block_dim]. 
- """ - assert hparams.num_blocks * hparams.block_dim == hparams.hidden_size - x_sliced = tf.reshape(x, shape=[-1, hparams.num_blocks, hparams.block_dim]) - return x_sliced - - -def nearest(x, means, hparams): - """Find the nearest means to elements in x.""" - x_reshaped = hparams.reshape_fn(x, hparams) - x_norm_sq = tf.reduce_sum(tf.square(x_reshaped), axis=-1, keep_dims=True) - means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keep_dims=True) - scalar_prod = tf.matmul( - tf.transpose(x_reshaped, perm=[1, 0, 2]), - tf.transpose(means, perm=[0, 2, 1])) - scalar_prod = tf.transpose(scalar_prod, perm=[1, 0, 2]) - dist = x_norm_sq + tf.transpose( - means_norm_sq, perm=[2, 0, 1]) - 2 * scalar_prod - if hparams.random_top_k > 1: - _, top_k_idx = tf.nn.top_k(-dist, k=hparams.random_top_k) - nearest_idx = tf.gather( - top_k_idx, - tf.random_uniform( - [1], minval=0, maxval=hparams.random_top_k - 1, dtype=tf.int32), - axis=-1) - else: - nearest_idx = tf.argmax(-dist, axis=-1) - nearest_hot = tf.one_hot(nearest_idx, hparams.block_v_size) - shape = common_layers.shape_list(x) - shape[-1] = hparams.num_blocks - shape.append(hparams.block_v_size) - nearest_hot = tf.reshape(nearest_hot, shape=shape) - return tf.stop_gradient(nearest_hot) - - -def kmeans(x, means, hparams): - """Compute the nearest neighbors and the loss for training the embeddings.""" - x_means_hot = nearest(x, means, hparams) - x_means_hot_flat = tf.reshape(x_means_hot, - [-1, hparams.num_blocks, hparams.block_v_size]) - x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means) - x_means = tf.transpose(x_means, [1, 0, 2]) - x_reshaped = hparams.reshape_fn(x, hparams) - q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x_reshaped) - x_means))) - e_loss = tf.reduce_mean(tf.square(x_reshaped - tf.stop_gradient(x_means))) - return x_means_hot, x_means, q_loss, e_loss - - -def bit_to_int(x_bit, nbits, base=2): - """Turn x_bit representing numbers bitwise (lower-endian) to int tensor.""" - x_l = tf.stop_gradient(tf.reshape(x_bit, [-1, nbits])) - x_labels = [] - for i in range(nbits): - x_labels.append(x_l[:, i] * tf.to_int32(base)**tf.to_int32(i)) - res = sum(x_labels) - return tf.to_int32(tf.reshape(res, common_layers.shape_list(x_bit)[:-1])) - - -def int_to_bit(x_int, nbits, base=2): - """Turn x_int representing numbers into a bitwise (lower-endian) tensor.""" - x_l = tf.expand_dims(x_int, axis=-1) - x_labels = [] - for i in range(nbits): - x_labels.append( - tf.floormod( - tf.floordiv(tf.to_int32(x_l), - tf.to_int32(base)**i), tf.to_int32(base))) - res = tf.concat(x_labels, axis=-1) - return tf.to_float(res) - - -def bottleneck(x, - hparams, - filter_size, - name, - means=None, - ema_count=None, - ema_means=None): - """Bottleneck.""" - if hparams.bottleneck_kind == "vq-vae": - assert means is not None - if hparams.ema: - assert ema_count is not None - assert ema_means is not None - - def embed(x): - """Embedding function; must be compatible with the code later.""" - with tf.variable_scope(name, reuse=tf.AUTO_REUSE): - if hparams.bottleneck_kind == "semhash": - c = int_to_bit(x, z_size) - h1a = tf.layers.dense(c, filter_size, name="vch1a") - h1b = tf.layers.dense(1.0 - c, filter_size, name="vch1b") - h1 = h1a + h1b - elif hparams.bottleneck_kind == "gumbel-softmax": - hot = tf.one_hot(x, hparams.v_size) - h1 = tf.layers.dense(hot, hparams.hidden_size, name="dae_dense") - elif hparams.bottleneck_kind == "vq-vae": - shape_x = common_layers.shape_list(x) - x_flat = tf.reshape(x, [-1, 1]) - c = int_to_bit(x_flat, 
nbits=hparams.z_size, base=2) - shape = common_layers.shape_list(c) - new_shape = shape - new_shape[-1] = hparams.num_blocks - new_shape.append(int(hparams.z_size / hparams.num_blocks)) - c = tf.to_int32(tf.reshape(c, shape=new_shape)) - c = bit_to_int( - c, - nbits=int(hparams.z_size / hparams.num_blocks), - base=2) - c_hot = tf.one_hot(c, depth=hparams.block_v_size, axis=-1) - c_hot_flat = tf.reshape( - c_hot, shape=[-1, hparams.num_blocks, hparams.block_v_size]) - h1 = tf.matmul(tf.transpose(c_hot_flat, perm=[1, 0, 2]), means) - h1 = tf.transpose(h1, perm=[1, 0, 2]) - new_shape = shape_x - new_shape.append(hparams.hidden_size) - h1 = tf.reshape(h1, new_shape) - elif hparams.bottleneck_kind == "rounding": - h1 = x - - h2 = tf.layers.dense(tf.nn.relu(h1), filter_size, name="vch2") - return tf.layers.dense(tf.nn.relu(h2), hparams.hidden_size, name="vcfin") - - with tf.variable_scope(name, reuse=tf.AUTO_REUSE): - z_size = hparams.z_size - l = tf.constant(0.0) - if hparams.bottleneck_kind == "dense": - c = tf.layers.dense(x, z_size, name="vcc") - h1 = tf.layers.dense(c, filter_size, name="vch1") - if hparams.bottleneck_kind == "vae": - c, l, _, _ = vae(x, z_size, "vae") - h1 = tf.layers.dense(c, filter_size, name="vch1") - if hparams.bottleneck_kind == "semhash": - c = tf.layers.dense(x, z_size, name="vcc") - y_clean = common_layers.saturating_sigmoid(c) - if _DO_SUMMARIES: - tf.summary.histogram("y_clean", tf.reshape(y_clean, [-1])) - if hparams.noise_dev > 0 and hparams.mode == tf.estimator.ModeKeys.TRAIN: - dev = hparams.noise_dev - noise = tf.truncated_normal(common_layers.shape_list(c), - mean=0.0, stddev=dev) - y = common_layers.saturating_sigmoid(c + noise) - else: - y = y_clean - d = tf.to_float(tf.less(0.5, y)) - y_discrete = tf.stop_gradient(d) + y - tf.stop_gradient(y) - pd = common_layers.inverse_exp_decay(hparams.startup_steps * 2) - pd *= hparams.d_mix - pd = pd if hparams.mode == tf.estimator.ModeKeys.TRAIN else 1.0 - c = tf.where(tf.less(tf.random_uniform( - [common_layers.shape_list(y)[0]]), pd), y_discrete, y) - h1a = tf.layers.dense(c, filter_size, name="vch1a") - h1b = tf.layers.dense(1.0 - c, filter_size, name="vch1b") - h1 = h1a + h1b - dx = tf.to_int32(tf.stop_gradient(d)) - c = bit_to_int(dx, z_size) - if hparams.bottleneck_kind == "gumbel-softmax": - _, hot, l = dae(x, hparams, name) - c = tf.argmax(hot, axis=-1) - h1 = tf.layers.dense(hot, hparams.hidden_size, name="dae_dense") - if hparams.bottleneck_kind == "vq-vae": - x_means_hot, x_means, q_loss, e_loss = kmeans(x, means, hparams) - - # Get the discrete latent represenation - x_means_idx = tf.argmax(x_means_hot, axis=-1) - - # Get the binary representation - x_means_bits = int_to_bit( - x_means_idx, - nbits=int(hparams.z_size / hparams.num_blocks), - base=2) - shape = common_layers.shape_list(x_means_bits) - new_shape = shape[:-1] - new_shape[-1] = hparams.z_size - x_means_bits = tf.reshape(x_means_bits, shape=new_shape) - c = bit_to_int( - tf.to_int32(x_means_bits), - nbits=hparams.z_size, - base=2) - - # Update the ema variables - if hparams.ema: - tf.logging.info("Using EMA with beta = {}".format(hparams.beta)) - updated_ema_count = moving_averages.assign_moving_average( - ema_count, - tf.reduce_sum( - tf.reshape( - x_means_hot, - shape=[-1, hparams.num_blocks, hparams.block_v_size]), - axis=0), - hparams.decay, - zero_debias=False) - - x_means_hot_flat = tf.reshape( - x_means_hot, shape=[-1, hparams.num_blocks, hparams.block_v_size]) - x_reshaped = hparams.reshape_fn(x, hparams) - dw = tf.matmul( - 
tf.transpose(x_means_hot_flat, perm=[1, 2, 0]), - tf.transpose(x_reshaped, perm=[1, 0, 2])) - updated_ema_means = moving_averages.assign_moving_average( - ema_means, dw, hparams.decay, zero_debias=False) - n = tf.reduce_sum(updated_ema_count, axis=-1, keep_dims=True) - updated_ema_count = ((updated_ema_count + hparams.epsilon) / - (n + hparams.v_size * hparams.epsilon) * n) - updated_ema_means /= tf.expand_dims(updated_ema_count, axis=-1) - - with tf.control_dependencies([e_loss]): - update_means = tf.assign(means, updated_ema_means) - with tf.control_dependencies([update_means]): - l = hparams.beta * e_loss - else: - l = q_loss + hparams.beta * e_loss - - x_reshaped = hparams.reshape_fn(x, hparams) - shape = common_layers.shape_list(x) - x_means = tf.reshape(x_means, shape) - x_reshaped = tf.reshape(x_reshaped, shape) - h1 = x_reshaped + tf.stop_gradient(x_means - x_reshaped) - - if hparams.bottleneck_kind == "rounding": - h = tf.layers.dense(x, 1, name="vcc") - - # Make h between 0 and 1 - h = tf.sigmoid(h) - - # Multiply by z_size to get it between [0, z_size] - h *= hparams.v_size - - # Use the rounding bottleneck - h1 = h + tf.stop_gradient(tf.round(h) - h) - c = tf.squeeze(tf.round(h), axis=-1) - c = tf.to_int32(c) - h2 = tf.layers.dense(tf.nn.relu(h1), filter_size, name="vch2") - res = tf.layers.dense(tf.nn.relu(h2), hparams.hidden_size, name="vcfin") - return res, c, l, embed - - def compress(x, c, is_2d, hparams, name): """Compress.""" with tf.variable_scope(name): @@ -627,10 +302,7 @@ def ae_transformer_internal(inputs, target_space, hparams, cache=None, - predict_mask=1.0, - means=None, - ema_count=None, - ema_means=None): + predict_mask=1.0): """AE Transformer, main step used for training.""" # Summaries break with the do_refine cond, turn them off in that case. global _DO_SUMMARIES @@ -657,7 +329,7 @@ def ae_transformer_internal(inputs, # flatten here original_targets_shape = tf.shape(targets) if hparams.task == "image": - cia.maybe_reshape_4d_to_3d(targets, hparams) + cia.maybe_reshape_4d_to_3d(targets) if hparams.task == "translate": max_targets_len_from_inputs = tf.concat([inputs, inputs], axis=1) else: @@ -666,12 +338,17 @@ def ae_transformer_internal(inputs, targets, _ = common_layers.pad_to_same_length( targets, max_targets_len_from_inputs, final_length_divisible_by=2**hparams.num_compress_steps) - targets_c = compress(targets, inputs, False, hparams, "compress") + if hparams.ae_input: + targets_c = compress(targets, inputs, False, hparams, "compress") + else: + targets_c = compress(targets, None, False, hparams, "compress") if hparams.mode != tf.estimator.ModeKeys.PREDICT: # Compress and bottleneck. 
- latents_dense, latents_discrete, extra_loss, embed = bottleneck( - targets_c, hparams, - hparams.compress_filter_size, "vc", means, ema_count, ema_means) + latents_dense, latents_discrete, extra_loss, embed = hparams.bottleneck( + x=targets_c, + filter_size=hparams.compress_filter_size, + name="vc", + mode=hparams.mode) if _DO_SUMMARIES: tf.summary.histogram("b0", tf.reshape(latents_discrete[:, 0, :], [-1])) pc = common_layers.inverse_exp_decay(hparams.startup_steps) @@ -696,9 +373,11 @@ def ae_transformer_internal(inputs, losses["latent_pred"] = tf.reduce_mean((inputs_c - targets_c)**2) * 20 def bn_inputs(): with tf.variable_scope(tf.get_variable_scope(), reuse=True): - bn, _, _, _ = bottleneck(inputs_c, hparams, - hparams.compress_filter_size, "vc", means, - ema_count, ema_means) + bn, _, _, _ = hparams.bottleneck( + x=inputs_c, + filter_size=hparams.compress_filter_size, + name="vc", + mode=hparams.mode) return bn pbn = 0.8 if hparams.mode == tf.estimator.ModeKeys.TRAIN else 1.0 inputs_c = tf.cond(tf.less(tf.random_uniform([]), pbn), @@ -710,14 +389,15 @@ def bn_inputs(): else: if hparams.bottleneck_kind in ["dense", "vae"]: inputs_c = decode_transformer(inputs, ed, targets_c, hparams, "dec_c") - latents_dense, _, _, _ = bottleneck( - inputs_c, hparams, hparams.compress_filter_size, "vc", - means, ema_count, ema_means) + latents_dense, _, _, _ = hparams.bottleneck( + x=inputs_c, + filter_size=hparams.compress_filter_size, + name="vc", + mode=hparams.mode) else: latent_len = common_layers.shape_list(targets_c)[1] - _, _, _, embed = bottleneck(targets_c, hparams, - hparams.compress_filter_size, "vc", means, - ema_count, ema_means) + _, _, _, embed = hparams.bottleneck( + x=targets_c, filter_size=hparams.compress_filter_size, name="vc") latents_dense = tf.zeros_like(targets_c[:, :latent_len, :, :]) if cache is None: cache = ae_latent_sample( @@ -785,56 +465,76 @@ def __init__(self, *args, **kwargs): super(TransformerAE, self).__init__(*args, **kwargs) self.predict_mask = 1.0 - # Define the embeddings if we are using vq-vae - self.means = None - self.ema_count = None - self.ema_means = None - if self._hparams.bottleneck_kind == "vq-vae": - # Check that num_blocks exactly divides hidden_size and v_size - assert self._hparams.hidden_size % self._hparams.num_blocks == 0 - assert self._hparams.v_size % self._hparams.num_blocks == 0 - - self._hparams.block_dim = int( - self._hparams.hidden_size // self._hparams.num_blocks) - self._hparams.block_v_size = 2**( - self._hparams.z_size / self._hparams.num_blocks) - self._hparams.block_v_size = int(self._hparams.block_v_size) + # Define bottleneck function + self._hparams.bottleneck = partial( + discretization.discrete_bottleneck, + hidden_size=self._hparams.hidden_size, + z_size=self._hparams.z_size, + filter_size=self._hparams.filter_size, + startup_steps=self.hparams.startup_steps, + bottleneck_kind=self._hparams.bottleneck_kind, + num_blocks=self._hparams.num_blocks, + reshape_method=self._hparams.reshape_method, + beta=self._hparams.beta, + noise_dev=self._hparams.noise_dev, + decay=self._hparams.decay, + discrete_mix=self._hparams.d_mix, + random_top_k=self._hparams.random_top_k, + epsilon=self._hparams.epsilon, + softmax_k=self._hparams.softmax_k, + kl_warmup_steps=self._hparams.kl_warmup_steps, + ema=self._hparams.ema, + summary=_DO_SUMMARIES, + dp_strength=self._hparams.dp_strength, + dp_decay=self._hparams.dp_decay, + dp_alpha=self._hparams.dp_alpha) + + # Set the discretization bottleneck specific things here + if 
self._hparams.bottleneck_kind == "dvq": + block_dim = int(self._hparams.hidden_size // self._hparams.num_blocks) + block_v_size = 2**(self._hparams.z_size / self._hparams.num_blocks) + block_v_size = int(block_v_size) if self._hparams.reshape_method == "project": - tf.logging.info("Using projections for decomposed vq-vae") + tf.logging.info("Using projections for DVQ") tf.logging.info("Trainable projections = {}".format( self._hparams.trainable_projections)) - self._hparams.projection_tensors = tf.get_variable( + + projection_tensors = tf.get_variable( name="projection", shape=[ - self._hparams.num_blocks, self._hparams.hidden_size, - self._hparams.block_dim + self._hparams.num_blocks, self._hparams.hidden_size, block_dim ], initializer=tf.contrib.layers.xavier_initializer(), trainable=self._hparams.trainable_projections) - self._hparams.reshape_fn = project_hidden + + self._hparams.bottleneck = partial( + self._hparams.bottleneck, projection_tensors=projection_tensors) elif self._hparams.reshape_method == "slice": - tf.logging.info("Using slices for decomposed vq-vae") - self._hparams.reshape_fn = slice_hidden + tf.logging.info("Using slices for DVQ") else: raise ValueError("Unknown reshape method") - self.means = tf.get_variable( + means = tf.get_variable( name="means", - shape=[ - self._hparams.num_blocks, self._hparams.block_v_size, - self._hparams.block_dim - ], + shape=[self._hparams.num_blocks, block_v_size, block_dim], initializer=tf.uniform_unit_scaling_initializer()) # Create the shadow variables if we are using EMA if self._hparams.ema: - self.ema_count = tf.get_variable( - "ema_count", [self._hparams.num_blocks, self._hparams.block_v_size], + ema_count = tf.get_variable( + "ema_count", [self._hparams.num_blocks, block_v_size], initializer=tf.constant_initializer(0)) - with tf.colocate_with(self.means): - self.ema_means = tf.get_variable( - "ema_means", initializer=self.means.initialized_value()) + with tf.colocate_with(means): + ema_means = tf.get_variable( + "ema_means", initializer=means.initialized_value()) + + # Update bottleneck + self._hparams.bottleneck = partial( + self._hparams.bottleneck, + means=means, + ema_count=ema_count, + ema_means=ema_means) @property def has_input(self): @@ -852,10 +552,7 @@ def body(self, features): features["target_space_id"], self._hparams, features.get("cache_raw", None), - predict_mask=self.predict_mask, - means=self.means, - ema_count=self.ema_count, - ema_means=self.ema_means) + predict_mask=self.predict_mask) return res, loss def prepare_features_for_infer(self, features): @@ -870,8 +567,7 @@ def prepare_features_for_infer(self, features): targets = tf.zeros([beam_batch_size, 1, 1, self._hparams.hidden_size]) with tf.variable_scope("body"): _, _, cache = ae_transformer_internal( - inputs, targets, features["target_space_id"], self._hparams, - self.means, self.ema_count, self.ema_means) + inputs, targets, features["target_space_id"], self._hparams) features["cache_raw"] = cache def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, @@ -935,13 +631,19 @@ def transformer_ae_small(): hparams.add_hparam("z_size", 14) hparams.add_hparam("noise_dev", 0.5) hparams.add_hparam("d_mix", 0.5) - # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae. + # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, dvq. 
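The `functools.partial` wiring above freezes every hparams-dependent argument of `discretization.discrete_bottleneck` once in `__init__`, so call sites only supply `x`, `filter_size`, `name`, and `mode`. A minimal sketch of the pattern with a stand-in function; the names here are illustrative, not from the patch:

```python
from functools import partial


def bottleneck_fn(x, filter_size, name, mode=None,
                  hidden_size=None, z_size=None, bottleneck_kind="dvq"):
  # Stand-in for discretization.discrete_bottleneck.
  return x, None, 0.0, lambda code: code


# Freeze the hparams-dependent arguments once ...
bottleneck = partial(bottleneck_fn, hidden_size=512, z_size=14,
                     bottleneck_kind="dvq")
# ... so later calls look like the ones in ae_transformer_internal.
res, c, loss, embed = bottleneck(x="targets_c", filter_size=2048,
                                 name="vc", mode="train")
```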
hparams.add_hparam("bottleneck_kind", "semhash") hparams.add_hparam("num_blocks", 1) hparams.add_hparam("num_decode_blocks", 1) - # Reshape method for decomposed vq-vae: slice, project + # Reshape method for DVQ: slice, project hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) + # Add option to pass the input to the autoencoder + hparams.add_hparam("ae_input", False) + # Hparams for Dirichlet process process + hparams.add_hparam("dp_alpha", 0.5) + hparams.add_hparam("dp_strength", 0.25) + hparams.add_hparam("dp_decay", 1.0) hparams.add_hparam("unmasked_percentage", 0.1) hparams.add_hparam("do_ae", True) hparams.add_hparam("do_mask", True) @@ -1086,3 +788,12 @@ def transformer_ae_a8(): hparams.optimizer = "Adafactor" hparams.noise_dev = 0.5 return hparams + + +@registry.register_hparams +def transformer_ae_base_tpu(): + """Base config adjusted for TPU.""" + hparams = transformer_ae_base() + transformer.update_hparams_for_tpu(hparams) + hparams.batch_size = 512 + return hparams diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 11d446f5b..09b252291 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -882,7 +882,8 @@ def transformer_base_v1(): hparams.max_length = 256 hparams.clip_grad_norm = 0. # i.e. no gradient clipping hparams.optimizer_adam_epsilon = 1e-9 - hparams.learning_rate_schedule = "linear_warmup_rsqrt_decay" + hparams.learning_rate_schedule = "legacy" + hparams.learning_rate_decay_scheme = "noam" hparams.learning_rate = 0.1 hparams.learning_rate_warmup_steps = 4000 hparams.initializer_gain = 1.0 @@ -943,6 +944,11 @@ def transformer_base(): # transformer_base_v2. hparams = transformer_base_v2() hparams.optimizer_adam_beta2 = 0.997 + # New way of specifying learning rate schedule. + # Equivalent to previous version. + hparams.learning_rate_schedule = ( + "constant*linear_warmup*rsqrt_decay*rsqrt_hidden_size") + hparams.learning_rate_constant = 2.0 return hparams @@ -1279,7 +1285,10 @@ def update_hparams_for_tpu(hparams): """Change hparams to be compatible with TPU training.""" # Adafactor uses less memory than Adam. + # switch to Adafactor with its recommended learning rate scheme. hparams.optimizer = "Adafactor" + hparams.learning_rate_schedule = "rsqrt_decay" + hparams.learning_rate_warmup_steps = 10000 # Avoid an expensive concat on TPU. 
# >1 shards helps with faster parameter distribution on multi-GPU machines diff --git a/tensor2tensor/test_data/example_usr_dir/my_submodule.py b/tensor2tensor/test_data/example_usr_dir/my_submodule.py index 70929afbc..e3ffd962c 100644 --- a/tensor2tensor/test_data/example_usr_dir/my_submodule.py +++ b/tensor2tensor/test_data/example_usr_dir/my_submodule.py @@ -56,10 +56,10 @@ def dataset_splits(self): # 10% evaluation data return [{ "split": problem.DatasetSplit.TRAIN, - "shards": 90, + "shards": 9, }, { "split": problem.DatasetSplit.EVAL, - "shards": 10, + "shards": 1, }] def generate_samples(self, data_dir, tmp_dir, dataset_split): diff --git a/tensor2tensor/test_data/example_usr_dir/requirements.txt b/tensor2tensor/test_data/example_usr_dir/requirements.txt new file mode 100644 index 000000000..3678319be --- /dev/null +++ b/tensor2tensor/test_data/example_usr_dir/requirements.txt @@ -0,0 +1 @@ +gutenberg diff --git a/tensor2tensor/test_data/example_usr_dir/setup.py b/tensor2tensor/test_data/example_usr_dir/setup.py deleted file mode 100644 index ad3701bb2..000000000 --- a/tensor2tensor/test_data/example_usr_dir/setup.py +++ /dev/null @@ -1,34 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Example setup.py for a t2t_usr_dir launching on Cloud ML Engine. - -This is only necessary if you have additional required pip packages for the -import of your usr_dir, and only if you're launching t2t-trainer on Cloud ML -Engine with the --cloud_mlengine flag. - -Note that the call to setup uses find_packages() and that the location of this -file is alongside the __init__.py file that imports my_submodule. -""" -from setuptools import find_packages -from setuptools import setup -setup( - name='DummyUsrDirPackage', - version='0.1', - packages=find_packages(), - install_requires=[ - 'gutenberg', - ], -) diff --git a/tensor2tensor/utils/adafactor.py b/tensor2tensor/utils/adafactor.py new file mode 100644 index 000000000..de14aff52 --- /dev/null +++ b/tensor2tensor/utils/adafactor.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Optimization.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import tensorflow as tf + + +class AdafactorOptimizer(tf.train.Optimizer): + """Optimizer that implements the Adafactor algorithm. 
+
+  Adafactor is described in TODO(noam): post paper to arxiv.
+
+  Adafactor is most similar to Adam (Kingma and Ba); the major differences are:
+
+  1. For a two-dimensional AxB weight matrix, Adafactor uses only A+B auxiliary
+     parameters to maintain the second-moment estimator, instead of AB.
+     This is advantageous on memory-limited systems. In addition, beta1
+     (momentum) is set to zero by default, saving an additional auxiliary
+     parameter per weight.
+
+  2. Adafactor incorporates "update-clipping" - a scale-invariant analog of
+     gradient clipping. This adds stability.
+
+  3. Adafactor does not require an external "learning rate". By default, it
+     incorporates a relative-update-scale schedule, corresponding to
+     inverse-square-root learning-rate-decay in ADAM. We hope this works well
+     for most applications.
+
+  ALGORITHM:
+
+  parameter -= absolute_update_scale * clip(grad / grad_scale)
+
+  where:
+
+    absolute_update_scale := relative_update_scale * parameter_scale
+    relative_update_scale := min((step_num + 1)**-0.5, 1e-2)
+    parameter_scale := max(rms(var), 1e-3)
+    clip(x) := x / max(1.0, rms(x))
+    grad_scale := tf.sqrt(v)   (v is the second-moment estimator)
+
+  The second-moment estimator v is maintained in a manner similar to Adam:
+  We initialize
+  ```
+  if var is 2-dimensional:
+    v_r <- zeros([num_rows])
+    v_c <- zeros([num_cols])
+  else:
+    v <- zeros(shape(var))
+  ```
+
+  The update rule is as follows:
+  ```
+  decay_rate = 1 - (step_num + 1) ^ -0.8
+  grad_squared = tf.square(grad) + epsilon
+  if var is 2-dimensional:
+    v_r <- decay_rate * v_r + (1 - decay_rate) * reduce_mean(grad_squared, 1)
+    v_c <- decay_rate * v_c + (1 - decay_rate) * reduce_mean(grad_squared, 0)
+    v = outer_prod(v_r, v_c) / reduce_mean(v_r)
+  else:
+    v <- decay_rate * v + (1 - decay_rate) * grad_squared
+  ```
+
+
+  Several parts of this algorithm are configurable from the initializer.
+
+    multiply_by_parameter_scale: If True, then compute absolute_update_scale
+      as described above. If False, let absolute_update_scale be the externally
+      supplied learning_rate.
+    learning_rate: represents relative_update_scale if
+      multiply_by_parameter_scale==True, or absolute_update_scale if
+      multiply_by_parameter_scale==False.
+    decay_rate: Decay rate of the second moment estimator (varies by step_num).
+      This should be set to a function such that:
+      1-1/(step_num + 1) <= decay_rate(step_num) < 1.0
+    beta1: enables momentum, as in Adam. Uses extra memory if nonzero.
+    clipping_threshold: should be >=1.0 or None for no update clipping
+    factored: whether to factor the second-moment estimator. True means
+      less memory usage.
+
+  TODO(noam): we should also apply the 2d logic to the two final dimensions
+  of >2d convolutional kernels.
+  """
+
+  def __init__(self,
+               multiply_by_parameter_scale=True,
+               learning_rate=None,
+               decay_rate=None,
+               beta1=0.0,
+               clipping_threshold=1.0,
+               factored=True,
+               use_locking=False,
+               name="Adafactor"):
+    """Construct a new Adafactor optimizer.
+
+    See class comment.
+
+    Args:
+      multiply_by_parameter_scale: a boolean
+      learning_rate: an optional Scalar.
+      decay_rate: an optional Scalar.
+      beta1: a float value between 0 and 1
+      clipping_threshold: an optional float >= 1
+      factored: a boolean - whether to use factored second-moment estimator
+        for 2d variables
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AdafactorOptimizer".
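To make the factored second-moment estimator concrete, here is a small NumPy sketch of one step of the update rule from the class docstring for a 2-D weight. It is illustrative only; the function and variable names are invented for this example and are not part of the optimizer class.

```python
import numpy as np


def factored_second_moment_step(v_r, v_c, grad, step_num, epsilon=1e-30):
  """One update of the row/column statistics, following the docstring."""
  decay_rate = 1.0 - (step_num + 1.0) ** -0.8
  grad_squared = np.square(grad) + epsilon
  v_r = decay_rate * v_r + (1 - decay_rate) * grad_squared.mean(axis=1)
  v_c = decay_rate * v_c + (1 - decay_rate) * grad_squared.mean(axis=0)
  # Reconstruct the full AxB estimator from only A+B stored statistics.
  v = np.outer(v_r, v_c) / v_r.mean()
  return v_r, v_c, v


num_rows, num_cols = 4, 3
v_r, v_c = np.zeros(num_rows), np.zeros(num_cols)
grad = np.random.randn(num_rows, num_cols)
v_r, v_c, v = factored_second_moment_step(v_r, v_c, grad, step_num=0)
```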
+
+    Raises:
+      ValueError: if absolute_update_scale and relative_update_scale_fn are both
+        present or both absent.
+    """
+    super(AdafactorOptimizer, self).__init__(use_locking, name)
+    self._multiply_by_parameter_scale = multiply_by_parameter_scale
+    if learning_rate is None:
+      learning_rate = self._learning_rate_default(multiply_by_parameter_scale)
+    self._learning_rate = learning_rate
+    if decay_rate is None:
+      decay_rate = self._decay_rate_default()
+    self._decay_rate = decay_rate
+    self._beta1 = beta1
+    self._clipping_threshold = clipping_threshold
+    self._factored = factored
+
+  def _should_use_factored_second_moment_estimate(self, shape):
+    """Should we use a factored second moment estimator.
+
+    Based on the shape of the variable.
+
+    Args:
+      shape: a list of integers
+    Returns:
+      a boolean
+    """
+    return self._factored and len(shape) == 2
+
+  def _create_slots(self, var_list):
+    for var in var_list:
+      shape = var.get_shape().as_list()
+      if self._beta1:
+        self._zeros_slot(var, "m", self._name)
+      if self._should_use_factored_second_moment_estimate(shape):
+        r_val = tf.zeros([shape[0]], dtype=tf.float32)
+        c_val = tf.zeros([shape[1]], dtype=tf.float32)
+        self._get_or_make_slot(var, r_val, "vr", self._name)
+        self._get_or_make_slot(var, c_val, "vc", self._name)
+      else:
+        self._zeros_slot(var, "v", self._name)
+
+  def _apply_dense(self, grad, var):
+    return self._resource_apply_dense(grad, var)
+
+  def _parameter_scale(self, var):
+    """Estimate the scale of the parameters from the current values.
+
+    We include a minimum value of 0.001 to give it a chance to escape 0
+    if it was zero-initialized.
+
+    Instead of using the value, we could impute the scale from the shape,
+    as initializers do.
+
+    Args:
+      var: a variable or Tensor.
+    Returns:
+      a Scalar
+    """
+    return tf.maximum(reduce_rms(var), 0.001)
+
+  def _resource_apply_dense(self, grad, var):
+    grad_squared = tf.square(grad) + 1e-30
+    grad_squared_mean = tf.reduce_mean(grad_squared)
+    decay_rate = self._decay_rate
+    update_scale = self._learning_rate
+    if self._multiply_by_parameter_scale:
+      update_scale *= self._parameter_scale(var)
+    # HACK: Make things dependent on grad.
+    # This confounds the XLA rewriter and keeps it from fusing computations
+    # across different variables. This fusion is bad for HBM usage, since
+    # it causes the gradients to persist in memory.
+ decay_rate += grad_squared_mean * 1e-30 + update_scale += grad_squared_mean * 1e-30 + # END HACK + mixing_rate = 1.0 - decay_rate + shape = var.get_shape().as_list() + updates = [] + if self._should_use_factored_second_moment_estimate(shape): + grad_squared_row_mean = tf.reduce_mean(grad_squared, 1) + grad_squared_col_mean = tf.reduce_mean(grad_squared, 0) + vr = self.get_slot(var, "vr") + new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean) + vc = self.get_slot(var, "vc") + new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean) + vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking) + vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking) + updates = [vr_update, vc_update] + long_term_mean = tf.reduce_mean(new_vr) + r_factor = tf.rsqrt(new_vr / long_term_mean) + c_factor = tf.rsqrt(new_vc) + x = grad * tf.expand_dims(r_factor, 1) * tf.expand_dims(c_factor, 0) + else: + v = self.get_slot(var, "v") + new_v = decay_rate * v + mixing_rate * grad_squared + v_update = tf.assign(v, new_v, use_locking=self._use_locking) + updates = [v_update] + x = grad * tf.rsqrt(new_v) + if self._clipping_threshold is not None: + clipping_denom = tf.maximum(1.0, reduce_rms(x) / self._clipping_threshold) + x /= clipping_denom + subtrahend = update_scale * x + if self._beta1: + m = self.get_slot(var, "m") + new_m = self._beta1 * m + (1.0 - self._beta1) * subtrahend + updates.append(tf.assign(m, new_m, use_locking=self._use_locking)) + subtrahend = new_m + var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking) + updates = [var_update] + updates + return tf.group(*updates) + + def _decay_rate_default(self): + return adafactor_decay_rate_pow(0.8) + + def _learning_rate_default(self, multiply_by_parameter_scale): + learning_rate = tf.minimum(tf.rsqrt(step_num() + 1.0), 0.01) + if not multiply_by_parameter_scale: + learning_rate *= 0.05 + return learning_rate + + +def adafactor_decay_rate_adam(beta2): + """Second-moment decay rate like Adam, subsuming the correction factor. + + Args: + beta2: a float between 0 and 1 + Returns: + a scalar + """ + t = tf.to_float(tf.train.get_or_create_global_step()) + 1.0 + decay = beta2 * (1.0 - tf.pow(beta2, t - 1.0)) / (1.0 - tf.pow(beta2, t)) + # decay = tf.cond(tf.equal(t, 1.0), lambda: beta2, lambda: decay) + return decay + + +def adafactor_decay_rate_pow(exponent): + """Second moment decay rate where memory-length grows as step_num^exponent. + + Args: + exponent: a float between 0 and 1 + Returns: + a scalar + """ + return 1.0 - tf.pow((step_num() + 1.0), -exponent) + + +def step_num(): + return tf.to_float(tf.train.get_or_create_global_step()) + + +def adafactor_optimizer_from_hparams(hparams, lr): + """Create an Adafactor optimizer based on model hparams. + + Args: + hparams: model hyperparameters + lr: learning rate scalar. 
+ Returns: + an AdafactorOptimizer + Raises: + ValueError: on illegal values + """ + if hparams.optimizer_adafactor_decay_type == "Adam": + decay_rate = adafactor_decay_rate_adam( + hparams.optimizer_adafactor_beta2) + elif hparams.optimizer_adafactor_decay_type == "pow": + decay_rate = adafactor_decay_rate_pow( + hparams.optimizer_adafactor_memory_exponent) + else: + raise ValueError("unknown optimizer_adafactor_decay_type") + return AdafactorOptimizer( + multiply_by_parameter_scale=( + hparams.optimizer_adafactor_multiply_by_parameter_scale), + learning_rate=lr, + decay_rate=decay_rate, + beta1=hparams.optimizer_adafactor_beta1, + clipping_threshold=hparams.optimizer_adafactor_clipping_threshold, + factored=hparams.optimizer_adafactor_factored, + use_locking=False, + name="Adafactor") + + +def reduce_rms(x): + return tf.sqrt(tf.reduce_mean(tf.square(x))) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py old mode 100644 new mode 100755 index 1d9e1c591..9d0cc0f4a --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -36,15 +36,20 @@ # TODO(rsepassi): # * Enable multi-machine sync/async training -SETUP_PY = """ + +def get_setup_file(name, packages=None): + if not packages: + packages = [] + return """ from setuptools import find_packages from setuptools import setup setup( - name='DummyUsrDirPackage', + name='{name}', version='0.1', packages=find_packages(), + install_requires={pypi_packages} ) -""" +""".format(name=name, pypi_packages=str(list(packages))) def job_dir(): @@ -52,6 +57,15 @@ def job_dir(): return getattr(FLAGS, 'job-dir', '') or getattr(FLAGS, 'job_dir', '') +def get_requirements(usr_dir): + requirements_file = os.path.join(usr_dir, 'requirements.txt') + if not tf.gfile.Exists(requirements_file): + return [] + with tf.gfile.Open(requirements_file) as f: + pkg_list = f.readlines() + return [pkg.strip() for pkg in pkg_list if 'tensor2tensor' not in pkg] + + def flags_as_args(): """Convert FLAGS to list of args suitable for passing on cmd line.""" if hasattr(FLAGS, 'flag_values_dict'): @@ -77,27 +91,32 @@ def flags_as_args(): def machine_config(num_gpus=1, use_tpu=False, master_type=None): """Return dict specifying machine config for trainingInput.""" - scale_tier = 'BASIC_GPU' if use_tpu: - scale_tier = 'BASIC_TPU' + master_type = 'standard_tpu' elif num_gpus <= 0: - scale_tier = 'BASIC' - elif num_gpus > 1: - scale_tier = 'CUSTOM' - - config = {'scaleTier': scale_tier} - - if scale_tier == 'CUSTOM': - assert num_gpus > 1 - if num_gpus not in [4, 8]: + master_type = master_type or 'standard' + cpu_types = ['standard', 'large_model', 'complex_model_s', + 'complex_model_m', 'complex_model_l'] + if master_type not in cpu_types: + raise ValueError('Expected `cloudml_engine_master_type` to be one of %s ' + 'when `worker_gpu` <= 0, found %s.', str(cpu_types), + master_type) + elif num_gpus >= 1: + if num_gpus == 1: + if master_type != 'standard_gpu': + master_type = 'standard_p100' + elif num_gpus == 4: + if master_type != 'complex_model_m_gpu': + master_type = 'complex_model_m_p100' + elif num_gpus == 8: + master_type = 'complex_model_l_gpu' + else: raise ValueError('Must use exactly 1, 4, or 8 GPUs.') - config['masterType'] = ('complex_model_m_gpu' - if num_gpus == 4 else 'complex_model_l_gpu') - - if master_type: - config['masterType'] = master_type - - return config + assert master_type + return { + 'scaleTier': 'CUSTOM', + 'masterType': master_type + } def configure_job(): @@ -131,9 +150,6 @@ def 
configure_job(): FLAGS.autotune_parallel_trials, ) - if training_input['scaleTier'] == 'CUSTOM': - assert 'masterType' in training_input - timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') job_name = '%s_%s_t2t_%s' % (FLAGS.model, FLAGS.problems, timestamp) job_spec = {'jobId': job_name, 'trainingInput': training_input} @@ -173,7 +189,38 @@ def _tar_and_copy(src_dir, target_dir): def tar_and_copy_t2t(train_dir): """Tar Tensor2Tensor and cp to train_dir.""" tf.logging.info('Tarring and pushing local Tensor2Tensor package.') - t2t_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + + output = cloud.shell_output('pip show tensor2tensor').split('\n') + assert output[1].startswith('Version') + assert output[7].startswith('Location') + t2t_version = output[1].split(':')[1].strip() + t2t_dir = output[7].split(':')[1].strip() + + # A local installation cloned from GitHub will have a setup.py file and a docs + # folder + is_local_t2t = all([ + tf.gfile.Exists(os.path.join(t2t_dir, fname)) + for fname in ['setup.py', 'docs/cloud_mlengine.md'] + ]) + + if is_local_t2t: + tf.logging.info('Found local T2T installation. Tarring directory %s', + t2t_dir) + else: + # PyPI installation + # Create a folder with just a setup.py file pointing to the right version + tf.logging.info('Found PyPI T2T installation. Launching tensor2tensor==%s', + t2t_version) + t2t_dir = os.path.join(tempfile.gettempdir(), 'tensor2tensor_tmp') + shutil.rmtree(t2t_dir, ignore_errors=True) + os.mkdir(t2t_dir) + setup_fname = os.path.join(t2t_dir, 'setup.py') + setup_file_str = get_setup_file( + name='DummyT2TPackage', + packages=['tensor2tensor==%s' % t2t_version] + ) + with tf.gfile.Open(setup_fname, 'w') as f: + f.write(setup_file_str) t2t_tar = _tar_and_copy(t2t_dir, train_dir) return t2t_tar @@ -189,13 +236,12 @@ def tar_and_copy_usr_dir(usr_dir, train_dir): shutil.copytree(usr_dir, tmp_usr_dir) # Insert setup.py if one does not exist top_setup_fname = os.path.join(top_dir, 'setup.py') - usr_setup_fname = os.path.join(tmp_usr_dir, 'setup.py') - if tf.gfile.Exists(usr_setup_fname): - tf.gfile.Copy(usr_setup_fname, top_setup_fname) - tf.gfile.Remove(usr_setup_fname) - else: - with tf.gfile.Open(top_setup_fname, 'w') as f: - f.write(SETUP_PY) + setup_file_str = get_setup_file( + name='DummyUsrDirPackage', + packages=get_requirements(usr_dir) + ) + with tf.gfile.Open(top_setup_fname, 'w') as f: + f.write(setup_file_str) usr_tar = _tar_and_copy(top_dir, train_dir) return usr_tar diff --git a/tensor2tensor/utils/learning_rate.py b/tensor2tensor/utils/learning_rate.py new file mode 100644 index 000000000..169b59348 --- /dev/null +++ b/tensor2tensor/utils/learning_rate.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
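The cloud_mlengine.py changes above replace the static setup.py template with helpers that read a `requirements.txt` from the usr_dir and generate a setup.py string on the fly. The following standalone approximation shows the idea; the real helpers go through `tf.gfile`, and the function names and path handling here are invented for illustration.

```python
def make_setup_py(name, packages):
  # Assembles a setup.py string with the given name and pip dependencies,
  # in the same spirit as get_setup_file above.
  return """
from setuptools import find_packages
from setuptools import setup
setup(
    name='{name}',
    version='0.1',
    packages=find_packages(),
    install_requires={pypi_packages}
)
""".format(name=name, pypi_packages=str(list(packages)))


def read_requirements(path):
  # Reads one package per line; tensor2tensor itself is shipped separately,
  # so it is filtered out, as in get_requirements above.
  with open(path) as f:
    pkgs = [line.strip() for line in f]
  return [p for p in pkgs if p and 'tensor2tensor' not in p]


print(make_setup_py('DummyUsrDirPackage', ['gutenberg']))
```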
+ +"""Optimization.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +import tensorflow as tf + + +def learning_rate_factor(name, step_num, hparams): + if name == "constant": + return hparams.learning_rate_constant + elif name == "linear_warmup": + return tf.minimum(1.0, step_num / hparams.learning_rate_warmup_steps) + elif name == "rsqrt_decay": + return tf.rsqrt(tf.maximum(step_num, hparams.learning_rate_warmup_steps)) + elif name == "rsqrt_hidden_size": + return hparams.hidden_size ** -0.5 + elif name == "legacy": + return legacy_learning_rate_schedule(hparams) + else: + raise ValueError("unknown learning rate factor %s" % name) + + +def learning_rate_schedule(hparams): + """Learning rate schedule based on hparams.""" + step_num = tf.to_float(tf.train.get_or_create_global_step()) + schedule_string = hparams.learning_rate_schedule + names = schedule_string.split("*") + names = [name.strip() for name in names if name.strip()] + ret = 1.0 + for name in names: + ret *= learning_rate_factor(name, step_num, hparams) + return ret + + +def legacy_learning_rate_schedule(hparams): + """Backwards-compatible learning-rate schedule.""" + step_num = tf.to_float(tf.train.get_or_create_global_step()) + warmup_steps = tf.to_float(hparams.learning_rate_warmup_steps) + if hparams.learning_rate_decay_scheme == "noam": + ret = 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( + (step_num + 1) * warmup_steps**-1.5, (step_num + 1)**-0.5) + else: + warmup_steps = hparams.learning_rate_warmup_steps + warmup = _learning_rate_warmup(warmup_steps) + decay = _learning_rate_decay(hparams, warmup_steps) + ret = tf.where(step_num < warmup_steps, warmup, decay) + optimizer_correction = 0.002 if "Adam" in hparams.optimizer else 1.0 + return ret * optimizer_correction * hparams.learning_rate + + +def _legacy_sqrt_decay(step): + """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" + return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) + + +def _piecewise_learning_rate(step, boundaries, values): + """Scale learning rate according to the given schedule. + + Multipliers are not cumulative. + + Args: + step: global step + boundaries: List of steps to transition on. + values: Multiplier to apply at each boundary transition. + + Returns: + Scaled value for the learning rate. + """ + values = [1.0] + values + return tf.train.piecewise_constant( + step, boundaries, values, name="piecewise_lr") + + +def _learning_rate_decay(hparams, warmup_steps=0): + """Learning rate decay multiplier.""" + scheme = hparams.learning_rate_decay_scheme + warmup_steps = tf.to_float(warmup_steps) + global_step = tf.to_float(tf.train.get_or_create_global_step()) + + if not scheme or scheme == "none": + return tf.constant(1.) 
+ + tf.logging.info("Applying learning rate decay: %s.", scheme) + + if scheme == "exp": + decay_steps = hparams.learning_rate_decay_steps + p = (global_step - warmup_steps) / decay_steps + if hparams.learning_rate_decay_staircase: + p = tf.floor(p) + return tf.pow(hparams.learning_rate_decay_rate, p) + + if scheme == "piecewise": + return _piecewise_learning_rate(global_step, + hparams.learning_rate_boundaries, + hparams.learning_rate_multiples) + + if scheme == "cosine": + cycle_steps = hparams.learning_rate_cosine_cycle_steps + cycle_position = global_step % (2 * cycle_steps) + cycle_position = cycle_steps - tf.abs(cycle_steps - cycle_position) + return 0.5 * (1 + tf.cos(np.pi * cycle_position / cycle_steps)) + + if scheme == "cyclelinear10x": + # Cycle the rate linearly by 10x every warmup_steps, up and down. + cycle_steps = warmup_steps + cycle_position = global_step % (2 * cycle_steps) + cycle_position = tf.to_float( # Normalize to the interval [-1, 1]. + cycle_position - cycle_steps) / float(cycle_steps) + cycle_position = 1.0 - tf.abs(cycle_position) # 0 to 1 and back to 0. + return (cycle_position + 0.1) * 3.0 # 10x difference each cycle (0.3-3). + + if scheme == "sqrt": + return _legacy_sqrt_decay(global_step - warmup_steps) + + raise ValueError("Unrecognized learning rate decay scheme: %s" % + hparams.learning_rate_decay_scheme) + + +def _learning_rate_warmup(warmup_steps, warmup_schedule="exp"): + """Learning rate warmup multiplier.""" + if not warmup_steps: + return tf.constant(1.) + + tf.logging.info("Applying %s learning rate warmup for %d steps", + warmup_schedule, warmup_steps) + + warmup_steps = tf.to_float(warmup_steps) + global_step = tf.to_float(tf.train.get_or_create_global_step()) + + if warmup_schedule == "exp": + return tf.exp(tf.log(0.01) / warmup_steps)**(warmup_steps - global_step) + else: + assert warmup_schedule == "linear" + start = tf.constant(0.35) + return ((tf.constant(1.) - start) / warmup_steps) * global_step + start diff --git a/tensor2tensor/utils/optimize.py b/tensor2tensor/utils/optimize.py index 6b303d7d0..3d5526535 100644 --- a/tensor2tensor/utils/optimize.py +++ b/tensor2tensor/utils/optimize.py @@ -22,6 +22,7 @@ import numpy as np +from tensor2tensor.utils import adafactor from tensor2tensor.utils import yellowfin import tensorflow as tf @@ -82,7 +83,7 @@ def __init__(self, optimizer_name, lr, hparams, use_tpu=False): # We change the default epsilon for Adam and re-scale lr. # Using LazyAdam as it's much faster for large vocabulary embeddings. 
self._opt = tf.contrib.opt.LazyAdamOptimizer( - lr / 500.0, + lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) @@ -96,12 +97,12 @@ def __init__(self, optimizer_name, lr, hparams, use_tpu=False): learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) elif optimizer_name == "TrueAdam": self._opt = tf.train.AdamOptimizer( - lr / 500.0, + lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "Adafactor": - self._opt = AdafactorOptimizer(lr / 500.0) + self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr) else: self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) @@ -113,130 +114,6 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): grads_and_vars, global_step=global_step, name=name) -def _sqrt_decay(step): - """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" - return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) - - -def _exp_decay_after(step, rate, from_which_step): - """Decay exponentially by rate (per step) starting at from_which_step.""" - return tf.cond( - step < from_which_step, - lambda: tf.constant(1.0), - lambda: rate**(step - from_which_step), - name="exponential_decay_step_cond") - - -def piecewise_learning_rate(step, boundaries, values): - """Scale learning rate according to the given schedule. - - Multipliers are not cumulative. - - Args: - step: global step - boundaries: List of steps to transition on. - values: Multiplier to apply at each boundary transition. - - Returns: - Scaled value for the learning rate. - """ - values = [1.0] + values - return tf.train.piecewise_constant( - step, boundaries, values, name="piecewise_lr") - - -def learning_rate_decay(hparams, warmup_steps=0): - """Learning rate decay multiplier.""" - scheme = hparams.learning_rate_decay_scheme - warmup_steps = tf.to_float(warmup_steps) - global_step = tf.to_float(tf.train.get_or_create_global_step()) - - if not scheme or scheme == "none": - return tf.constant(1.) - - tf.logging.info("Applying learning rate decay: %s.", scheme) - - if scheme == "exp": - decay_steps = hparams.learning_rate_decay_steps - p = (global_step - warmup_steps) / decay_steps - if hparams.learning_rate_decay_staircase: - p = tf.floor(p) - return tf.pow(hparams.learning_rate_decay_rate, p) - - if scheme == "piecewise": - return piecewise_learning_rate(global_step, - hparams.learning_rate_boundaries, - hparams.learning_rate_multiples) - - if scheme == "cosine": - cycle_steps = hparams.learning_rate_cosine_cycle_steps - cycle_position = global_step % (2 * cycle_steps) - cycle_position = cycle_steps - tf.abs(cycle_steps - cycle_position) - return 0.5 * (1 + tf.cos(np.pi * cycle_position / cycle_steps)) - - if scheme == "cyclelinear10x": - # Cycle the rate linearly by 10x every warmup_steps, up and down. - cycle_steps = warmup_steps - cycle_position = global_step % (2 * cycle_steps) - cycle_position = tf.to_float( # Normalize to the interval [-1, 1]. - cycle_position - cycle_steps) / float(cycle_steps) - cycle_position = 1.0 - tf.abs(cycle_position) # 0 to 1 and back to 0. - return (cycle_position + 0.1) * 3.0 # 10x difference each cycle (0.3-3). 
- - if scheme == "sqrt": - return _sqrt_decay(global_step - warmup_steps) - - raise ValueError("Unrecognized learning rate decay scheme: %s" % - hparams.learning_rate_decay_scheme) - - -def learning_rate_warmup(warmup_steps, warmup_schedule="exp"): - """Learning rate warmup multiplier.""" - if not warmup_steps: - return tf.constant(1.) - - tf.logging.info("Applying %s learning rate warmup for %d steps", - warmup_schedule, warmup_steps) - - warmup_steps = tf.to_float(warmup_steps) - global_step = tf.to_float(tf.train.get_or_create_global_step()) - - if warmup_schedule == "exp": - return tf.exp(tf.log(0.01) / warmup_steps)**(warmup_steps - global_step) - else: - assert warmup_schedule == "linear" - start = tf.constant(0.35) - return ((tf.constant(1.) - start) / warmup_steps) * global_step + start - - -def learning_rate_decay_with_warmup(hparams, num_worker_replicas=1): - """Learning rate decay rate with warmup based on hparams.""" - warmup_steps = hparams.learning_rate_warmup_steps * num_worker_replicas - warmup = learning_rate_warmup(warmup_steps) - - decay = learning_rate_decay(hparams, warmup_steps) - - global_step = tf.train.get_or_create_global_step() - return tf.where(global_step < warmup_steps, warmup, decay) - - -def learning_rate_schedule(hparams, num_worker_replicas=1): - """Learning rate schedule based on hparams.""" - schedule = hparams.learning_rate_schedule - warmup_steps = tf.to_float(hparams.learning_rate_warmup_steps) - global_step = tf.to_float(tf.train.get_or_create_global_step()) - if hparams.learning_rate_decay_scheme == "noam": - # backwards compatiblity with previous behavior - schedule = "linear_warmup_rsqrt_decay" - if schedule == "warmup_and_decay": - return learning_rate_decay_with_warmup(hparams, num_worker_replicas) - elif schedule == "linear_warmup_rsqrt_decay": - return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( - (global_step + 1) * warmup_steps**-1.5, (global_step + 1)**-0.5) - else: - raise ValueError("Unrecognized learning rate schedule: %s" % schedule) - - def weight_decay_and_noise(loss, hparams, learning_rate, var_list=None): """Apply weight decay and weight noise.""" if var_list is None: @@ -347,231 +224,3 @@ def get_variable_initializer(hparams): else: raise ValueError("Unrecognized initializer: %s" % hparams.initializer) - -class AdafactorOptimizer(tf.train.Optimizer): - """Optimizer that implements the Adafactor algorithm. - - Adafactor is similar to RMSProp (ADAM, etc.), but takes advantage of the - structure of weight matrices to use less memory and to be more resilient to - sudden large gradients. - - The RMSProp algorithm works on each component independently as follows: - w -= grad * learning_rate / sqrt(estimated_mean_square_grad) - - learning_rate is the desired update magnitude, and - estimated_mean_square_grad is computed by exponential smoothing of the - square of the gradient. - - Adafactor addresses two shortcomings of RMSProp: - - 1. In RMSProp (ADAM, etc), maintaining estimated_mean_square_grad requires - memory equal to the number of parameters. This can be an impediment to - training large models on GPU/TPU systems with limited memory. - - Adafactor uses less memory. - For an AxB weight matrix, instead of keeping a full AxB - estimated_mean_square_grad matrix, Adafactor keeps only - exponentially-smoothed row and column means, and bases its estimates on - those means. Thus the memory requirements drop from `2AB` to `A+B`. - - 2. Depending on the decay rate of the exponential smoothing, we run into one - of two problems. 
- - If the decay rate is high (short memory), we see the problem described - here - worse final quality: - On the Convergence of Adam and Beyond - https://openreview.net/forum?id=ryQu7f-RZ - - If the decay rate is low (long memory), then the estimate does not adjust - rapidly to suddenly large gradients, and the model diverges. - Suddenly large gradients (which we will call anomalies), may happen either - due to weird training data, or because the model has just learned something - important and can now rush to exploit it. Momentum (as in ADAM) can help - prevent divergence, but it also requires more memory. Gradient clipping - can also help prevent divergence, but it is irritating in that setting - the right threshold depends on the knowing the scale of the gradients. - - Adafactor uses a relatively long memory (setting the decay rate to - step_num^-0.8), but detects and corrects for anomalies. An anomaly - is detected if the mean-square gradient for the current step - (across the entire weight matrix) is much greater than the historical - average. When this occurs, we increase estimated_mean_square_grad - for the current step for all weights in the matrix. Note: it is important - to detect anomalies based on entire matrices, rather than individual - weights, since any individual weight may legitimately have a pattern - of many small gradients and occasional very large ones. - - HYPERPARAMETERS: - learning_rate: desired magnitude of variable updates. a scalar - can be a - constant, but more likely should have a warmup and then decay - proportionally to rsqrt(step_num) - epsilon: 1e-20 - a small floating point value to avoid division by zero. - horizon_exponent: 0.8 - a value between 0 and 1 - The effective decay - horizon of the second-moment estimator is step_num^horizon_exponent. - anomaly_threshold: 2.0 - a value greater than 1. Suppress anomalies - where the mean-square-gradients for a step exceed the long-term average - by at least this factor. - - ALGORITHM: - - We initialize - ``` - t <- 0 - if var is 2-dimensional: - v_r <- zeros([num_rows]) - v_c <- zeros([num_cols]) - else: - v <- zeros(shape(var)) - ``` - - The update rule is as follows: - ``` - t <- t + 1 - decay_rate = 1 - t ^ (-horizon_exponent) - grad_squared = tf.square(grad) + epsilon - if var is 2-dimensional: - v_r <- decay_rate * v_r + (1 - decay_rate) * reduce_mean(grad_squared, 1) - v_c <- decay_rate * v_c + (1 - decay_rate) * reduce_mean(grad_squared, 0) - anomaly_factor = max(1.0, - reduce_mean(grad_squared) / reduce_mean(v_r) / anomaly_threshold) - est_v = anomaly_factor * outer_prod(v_r, v_c) / reduce_mean(v_r) - else: - v <- decay_rate * v + (1 - decay_rate) * grad_squared - anomaly_factor = max(1.0, - reduce_mean(grad_squared) / reduce_mean(v) / anomaly_threshold) - est_v = v * anomaly_factor - var <- var - lr * grad / sqrt(est_v) - ``` - TODO(noam): write a paper. - TODO(noam): we should also apply the 2d logic to the two final dimensions. - of >2d convolutional kernels. - """ - - def __init__(self, - learning_rate=0.001, - epsilon=1e-20, - horizon_exponent=0.8, - anomaly_threshold=2.0, - use_locking=False, - name="Adafactor"): - """Construct a new Adafactor optimizer. - - See class comment. - - Args: - learning_rate: A Tensor or a floating point value. The learning rate. - epsilon: A small constant for numerical stability. - horizon_exponent: a floating point value between 0 and 1 - anomaly_threshold: a floating point value >= 1.0 - use_locking: If True use locks for update operations. 
- name: Optional name for the operations created when applying gradients. - Defaults to "AdafactorOptimizer". - """ - super(AdafactorOptimizer, self).__init__(use_locking, name) - self._lr = learning_rate - self._epsilon = epsilon - self._horizon_exponent = horizon_exponent - self._anomaly_threshold = anomaly_threshold - - def _should_use_factored_second_moment_estimate(self, shape): - """Should we use a factored second moment estimator. - - Based on the shape of the variable. - - Args: - shape: a list of integers - Returns: - a boolean - """ - return len(shape) == 2 - - def _create_slots(self, var_list): - for v in var_list: - shape = v.get_shape().as_list() - if self._should_use_factored_second_moment_estimate(shape): - r_val = tf.zeros([shape[0]], dtype=tf.float32) - c_val = tf.zeros([shape[1]], dtype=tf.float32) - self._get_or_make_slot(v, r_val, "vr", self._name) - self._get_or_make_slot(v, c_val, "vc", self._name) - else: - self._zeros_slot(v, "v", self._name) - - def _apply_dense(self, grad, var): - return self._resource_apply_dense(grad, var) - - def _resource_apply_dense(self, grad, var): - grad_squared = tf.square(grad) + self._epsilon - grad_squared_mean = tf.reduce_mean(grad_squared) - lr = tf.to_float(self._lr) - global_step = tf.to_float(tf.train.get_or_create_global_step()) + 1.0 - # HACK: Make lr and global_step dependent on grad. - # This confounds the XLA rewriter and keeps it from fusing computations - # across different variables. This fusion is a bad for HBM usage, since - # it causes the gradients to persist in memory. - lr += grad_squared_mean * 1e-30 - global_step += grad_squared_mean * 1e-30 - # END HACK - mixing_rate = tf.pow(global_step, -self._horizon_exponent) - decay_rate = 1.0 - mixing_rate - shape = var.get_shape().as_list() - updates = [] - if self._should_use_factored_second_moment_estimate(shape): - grad_squared_row_mean = tf.reduce_mean(grad_squared, 1) - grad_squared_col_mean = tf.reduce_mean(grad_squared, 0) - vr = self.get_slot(var, "vr") - new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean) - vc = self.get_slot(var, "vc") - new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean) - vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking) - vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking) - updates = [vr_update, vc_update] - long_term_mean = tf.reduce_mean(new_vr) - anomaly_factor = self._anomaly_factor(grad_squared_mean, long_term_mean) - # This is the computation we should do. - # est_v = (tf.expand_dims(new_vr, 1) * tf.expand_dims(new_vc, 0) - # * anomaly_factor / long_term_mean) - # subtrahend = grad * lr / tf.sqrt(est_v) - # Instead we do the following, which is mathematically equivalent. - r_factor = lr * tf.rsqrt(new_vr * anomaly_factor / long_term_mean) - c_factor = tf.rsqrt(new_vc) - subtrahend = ( - grad * tf.expand_dims(r_factor, 1) * tf.expand_dims(c_factor, 0)) - else: - v = self.get_slot(var, "v") - new_v = decay_rate * v + mixing_rate * grad_squared - v_update = tf.assign(v, new_v, use_locking=self._use_locking) - updates = [v_update] - long_term_mean = tf.reduce_mean(new_v) - anomaly_factor = self._anomaly_factor(grad_squared_mean, long_term_mean) - # This is the computation we should do. - # est_v = (new_v * anomaly_factor) - # subtrahend = grad * lr / tf.sqrt(est_v) - # Instead we do the following, which is mathematically equivalent. 
- subtrahend = grad * (lr / tf.sqrt(anomaly_factor)) * tf.rsqrt(new_v) - var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking) - updates = [var_update] + updates - return tf.group(*updates) - - def _anomaly_factor(self, grad_squared_mean, long_term_mean): - """Multiplier for second-moment estimator, due to short-term anomalies. - - A step may have gradients with magnitudes much larger than the long-term - average. This can cause the model to diverge. In these cases, we want to - temoporarily increase the second-moment estimators to reflect that these - steps are anomalous. - - It is important to make these calculations on whole weight matrices, rather - than on individual parameters, since we want to allow individual parameters - to have occasional large updates. - - Args: - grad_squared_mean: A scalar. The mean square gradient on the varaible - for the current step. - long_term_mean: A scalar. The mean of the long-term second-moment - estimator. - Returns: - a scalar that should be multiplied into the second-moment-estimator for - this step. - """ - ratio = grad_squared_mean / long_term_mean - return tf.maximum(1.0, ratio / self._anomaly_threshold) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 1b4013fbc..085cc821f 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -34,6 +34,7 @@ from tensor2tensor.utils import beam_search from tensor2tensor.utils import decoding from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import learning_rate from tensor2tensor.utils import metrics from tensor2tensor.utils import optimize from tensor2tensor.utils import registry @@ -238,6 +239,9 @@ def bottom(self, features): # Transform the input features for key, input_modality in six.iteritems( self._problem_hparams.input_modality): + if key not in features: + tf.logging.warning("Missing feature %s - ignoring." % key) + continue do_reuse = input_modality.name in all_previous_modalities with tf.variable_scope(input_modality.name, reuse=do_reuse): log_info("Transforming feature '%s' with %s.bottom", key, @@ -336,13 +340,7 @@ def loss(self, logits, features): def optimize(self, loss, num_async_replicas=1): """Return a training op minimizing loss.""" log_info("Base learning rate: %f", self.hparams.learning_rate) - lr = self.hparams.learning_rate - decay_rate = optimize.learning_rate_schedule(self.hparams) - lr *= decay_rate - if self.hparams.learning_rate_minimum: - lr_min = float(self.hparams.learning_rate_minimum) - log_info("Applying learning rate minimum: %f", lr_min) - lr = tf.max(lr, tf.to_float(lr_min)) + lr = learning_rate.learning_rate_schedule(self.hparams) if num_async_replicas > 1: log_info("Dividing learning rate by num_async_replicas: %d", num_async_replicas)
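The net effect of the t2t_model.py change is that the schedule alone now yields the learning rate used for training: the extra multiplication by `hparams.learning_rate` and the manual minimum are no longer applied in `optimize`. The condensed sketch below illustrates the resulting flow; it is not the actual `T2TModel.optimize` method, the `optimize.optimize(loss, lr, hparams, use_tpu)` signature is assumed, and the async-replica adjustment is omitted for brevity.

```python
from tensor2tensor.utils import learning_rate
from tensor2tensor.utils import optimize


def training_op_sketch(loss, hparams, use_tpu=False):
  """Condensed illustration of how the training op is now assembled."""
  # The schedule string (e.g. "constant*linear_warmup*rsqrt_decay*...")
  # produces the final learning rate directly.
  lr = learning_rate.learning_rate_schedule(hparams)
  return optimize.optimize(loss, lr, hparams, use_tpu=use_tpu)
```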