diff --git a/setup.py b/setup.py index c62b3409c..5beeb1b3e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.6', + version='1.1.7', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer index 13dd7d355..7c7b48932 100644 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/bin/t2t-trainer @@ -30,6 +30,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os + # Dependency imports from tensor2tensor.utils import registry @@ -57,22 +59,25 @@ def main(_): usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) trainer_utils.log_registry() trainer_utils.validate_flags() - tf.gfile.MakeDirs(FLAGS.output_dir) + output_dir = os.path.expanduser(FLAGS.output_dir) + tmp_dir = os.path.expanduser(FLAGS.tmp_dir) + data_dir = os.path.expanduser(FLAGS.data_dir) + tf.gfile.MakeDirs(output_dir) # Generate data if requested. if FLAGS.generate_data: - tf.gfile.MakeDirs(FLAGS.data_dir) - tf.gfile.MakeDirs(FLAGS.tmp_dir) + tf.gfile.MakeDirs(data_dir) + tf.gfile.MakeDirs(tmp_dir) for problem_name in FLAGS.problems.split("-"): tf.logging.info("Generating data for %s" % problem_name) problem = registry.problem(problem_name) - problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) + problem.generate_data(data_dir, tmp_dir) # Run the trainer. 
trainer_utils.run( - data_dir=FLAGS.data_dir, + data_dir=data_dir, model=FLAGS.model, - output_dir=FLAGS.output_dir, + output_dir=output_dir, train_steps=FLAGS.train_steps, eval_steps=FLAGS.eval_steps, schedule=FLAGS.schedule) diff --git a/tensor2tensor/data_generators/cipher.py b/tensor2tensor/data_generators/cipher.py index 3a743337a..41dcbd80e 100644 --- a/tensor2tensor/data_generators/cipher.py +++ b/tensor2tensor/data_generators/cipher.py @@ -56,8 +56,8 @@ def _gen(nbr_symbols, max_length, nbr_cases): for plain, code in zip(indices, codes): yield { - "X": plain, - "Y": code, + "inputs": plain, + "targets": code, } return _gen @@ -99,8 +99,8 @@ def _gen(nbr_symbols, max_length, nbr_cases): for plain, code in zip(indices, codes): yield { - "X": plain, - "Y": code, + "inputs": plain, + "targets": code, } return _gen @@ -148,7 +148,7 @@ def key(self): return [1, 3] -class Layer(object): +class ShiftEncryptionLayer(object): """A single layer for shift.""" def __init__(self, vocab, shift): @@ -211,7 +211,7 @@ def encipher_shift(plaintext, plain_vocab, shift): ciphertext (list of Strings): encrypted plain text. """ ciphertext = [] - cipher = Layer(plain_vocab, shift) + cipher = ShiftEncryptionLayer(plain_vocab, shift) for _, sentence in enumerate(plaintext): cipher_sentence = [] @@ -238,7 +238,7 @@ def encipher_vigenere(plaintext, plain_vocab, key): # generate Vigenere table layers = [] for i in range(len(plain_vocab)): - layers.append(Layer(plain_vocab, i)) + layers.append(ShiftEncryptionLayer(plain_vocab, i)) for i, sentence in enumerate(plaintext): cipher_sentence = [] diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py deleted file mode 100644 index 9d7678fc4..000000000 --- a/tensor2tensor/data_generators/concatenate_examples.py +++ /dev/null @@ -1,180 +0,0 @@ -# coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Convert seq-seq examples to "concatenated" examples. - -The concatenated example has no "inputs". -Instead the source is at the beginning of the target. - -We can now use a simple language model. - -Example: -seq-seq mode: -{ - "inputs": subtokenizer.encode("I love you.") + [1] - "targets": subtokenizer.encode("Je t'aime.") + [1] -} --> -concatenated mode: -{ - "inputs": [0] - "targets": (subtokenizer.encode("source English I love you.") + [1] - + subtokenizer.encode("target French Je t'aime.") + [1]) -} - -We add a dummy feature "inputs"=[0] for compatibility with seq-to-seq models. - -If FLAGS.combine_to_length is nonzero, then we combine multiple examples into -examples of a constant length, possibly with some padding at the end. - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import random - -# Dependency imports - -from tensor2tensor.data_generators import generator_utils -from tensor2tensor.data_generators import text_encoder -import tensorflow as tf - -tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file") - -tf.flags.DEFINE_boolean( - "random_reverse", False, - "If true, write half of the example with source/target reversed") - -tf.flags.DEFINE_boolean( - "count_everything", False, - "If true, assign positive weights to designators, source and target. 
" - "If false, assign positive weights only to target.") - -tf.flags.DEFINE_string("source_domain_string", "English", "") -tf.flags.DEFINE_string("target_domain_string", "French", "") - -tf.flags.DEFINE_integer( - "combine_to_length", 0, - "If positive, concatenate examples to form examples with target length " - " equal to this value. Targets are padded with subtoken id=0.") - -tf.flags.DEFINE_string("in_file", "", "input filename") - -tf.flags.DEFINE_string( - "out_prefix", "/usr/local/google/tmp/concat", - "The output filename is equal to out_prefix plus " - "the last 15 characters of in_file. (e.g. -00001-of-00100)") - -FLAGS = tf.flags.FLAGS - - -def _make_example(ids, weights, raw_num_bytes): - if FLAGS.combine_to_length > 0: - ids += [0] * (FLAGS.combine_to_length - len(ids)) - return generator_utils.to_example({ - "targets": ids, - "target_weights": weights, - "inputs": [0], - "raw_num_bytes": [raw_num_bytes] - }).SerializeToString() - - -def main(_): - """Convert a file to examples.""" - subtokenizer = text_encoder.SubwordTextEncoder(FLAGS.vocab_file) - total_bytes = 0 - total_subtokens = 0 - total_examples = 0 - dropped_examples = 0 - - combined_subtokens = [] - combined_num_bytes = 0 - combined_weights = [] - - source_specifier = subtokenizer.encode("source " + FLAGS.source_domain_string) - target_specifier = subtokenizer.encode("target " + FLAGS.target_domain_string) - if FLAGS.random_reverse: - r_source_specifier = subtokenizer.encode("source " + - FLAGS.target_domain_string) - r_target_specifier = subtokenizer.encode("target " + - FLAGS.source_domain_string) - - reader = tf.python_io.tf_record_iterator(FLAGS.in_file) - - out_file = FLAGS.out_prefix + FLAGS.in_file[-15:] - writer = tf.python_io.TFRecordWriter(out_file) - - for record in reader: - total_examples += 1 - if total_examples % 1000 == 0: - tf.logging.info("total_examples: %d", total_examples) - x = tf.train.Example() - x.ParseFromString(record) - inputs = [i for i in 
x.features.feature["inputs"].int64_list.value] - targets = [i for i in x.features.feature["targets"].int64_list.value] - should_reverse = FLAGS.random_reverse and random.random() < 0.5 - source_bytes = len(subtokenizer.decode(inputs[:-1])) + 1 - target_bytes = len(subtokenizer.decode(targets[:-1])) + 1 - if not should_reverse: - subtokens = source_specifier + inputs + target_specifier + targets - weights = ([0.0] * - (len(source_specifier) + len(inputs) + len(target_specifier)) + - [1.0] * len(targets)) - num_bytes = target_bytes - else: - subtokens = r_source_specifier + targets + r_target_specifier + inputs - weights = ( - [0.0] * - (len(r_source_specifier) + len(targets) + len(r_target_specifier)) + - [1.0] * len(inputs)) - num_bytes = source_bytes - if FLAGS.count_everything: - weights = [1.0] * len(subtokens) - num_bytes = source_bytes + target_bytes - total_bytes += num_bytes - total_subtokens += sum(weights) - if FLAGS.combine_to_length: - if combined_subtokens and (len(combined_subtokens) + len(subtokens) > - FLAGS.combine_to_length): - writer.write( - _make_example(combined_subtokens, combined_weights, - combined_num_bytes)) - combined_subtokens = [] - combined_weights = [] - combined_num_bytes = 0 - if len(subtokens) <= FLAGS.combine_to_length: - combined_subtokens.extend(subtokens) - combined_weights.extend(weights) - combined_num_bytes += num_bytes - else: - dropped_examples += 1 - else: - writer.write(_make_example(subtokens, weights, num_bytes)) - if combined_subtokens: - writer.write( - _make_example(combined_subtokens, combined_weights, combined_num_bytes)) - writer.close() - - tf.logging.info("total bytes: %d", total_bytes) - tf.logging.info("total subtokens: %d", total_subtokens) - tf.logging.info("bytes per subtoken: %f", total_bytes / total_subtokens) - tf.logging.info("total documents: %d", total_examples) - tf.logging.info("dropped documents: %d", dropped_examples) - - -if __name__ == "__main__": - tf.app.run() diff --git 
a/tensor2tensor/data_generators/desc2code.py b/tensor2tensor/data_generators/desc2code.py index 98c93aacd..6cef6db63 100644 --- a/tensor2tensor/data_generators/desc2code.py +++ b/tensor2tensor/data_generators/desc2code.py @@ -22,6 +22,7 @@ import collections import os import random +import re import zipfile # Dependency imports @@ -42,10 +43,32 @@ _DATASET_PB_PATH = "description2code_current/" _DESC_DIR_NAME = "description" -_CODE_PY_DIR_NAME = "solutions_python" _VOCAB_EN_FILENAME = "vocab.endefr" -_VOCAB_PY_FILENAME = "vocab.py" + +_RE_CPP_INLINE_COMMENT = re.compile("//.*?\n") # Compiled once + + +# Constant defined for a language problem +CodingPbConstants = collections.namedtuple("CodingPbConstants", [ + "code_dir_name", + "vocab_filename", + "filter_patterns", + "target_space", +]) + +PB_PY = CodingPbConstants( + code_dir_name="solutions_python", + vocab_filename="vocab.py", + filter_patterns=["#include", "# include", "import java."], + target_space=problem.SpaceID.PY_TOK, +) +PB_CPP = CodingPbConstants( + code_dir_name="solutions_c++", + vocab_filename="vocab.cpp", + filter_patterns=["import java."], + target_space=problem.SpaceID.CPP_TOK, +) # Struct containing a coding problem (contains the paths to the descriptions # and code files) @@ -67,6 +90,14 @@ def num_shards(self): def use_subword_tokenizer(self): return True + @property + def input_space_id(self): + return problem.SpaceID.EN_TOK + + @property + def target_space_id(self): + return self.pb_constants.target_space + @property def input_vocab_size(self): return 2**15 # 32k @@ -81,7 +112,21 @@ def vocab_input_filename(self): @property def vocab_target_filename(self): - return "{}.{}".format(_VOCAB_PY_FILENAME, self.target_vocab_size) + return "{}.{}".format( + self.pb_constants.vocab_filename, self.target_vocab_size) + + def preprocess_target(self, target): + """Apply some preprocessing to the target. + + For instance, remove space/tabs. 
+ + Args: + target (str): code source content + + Returns: + the pre-processed string content + """ + return target def feature_encoders(self, data_dir): source_vocab_filename = os.path.join(data_dir, self.vocab_input_filename) @@ -93,24 +138,11 @@ def feature_encoders(self, data_dir): "targets": target_token, } - -@registry.register_problem("desc2code_py") -class Desc2CodePyProblem(Desc2CodeProblem): - """Description2Code for python problem.""" - - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.PY_TOK - def train_generator(self, data_dir, tmp_dir, train): # Called twice: for train and test # Get the list of the training samples (coding challenge samples) - samples = list(generator_samples(tmp_dir)) + samples = list(generator_samples(tmp_dir, self.pb_constants)) # Split between train and dev # Suffle to get problems from diverse sources (CodeChef and CodeForces) and @@ -145,6 +177,7 @@ def generator_samples_content(get_source, get_target): for code_file in sample.code_files: with tf.gfile.GFile(code_file, mode="r") as target_file: target = target_file.read() + target = self.preprocess_target(target) yield source, target elif sample.code_files: # Only take the source if a target exists yield source, target @@ -176,16 +209,47 @@ def generator_target(): } +@registry.register_problem("desc2code_py") +class Desc2CodePyProblem(Desc2CodeProblem): + """Description2Code for python problem.""" + + @property + def pb_constants(self): + return PB_PY + + def preprocess_target(self, target): + """Simple tab to space replacement.""" + return target.replace("\t", " ") + + +@registry.register_problem("desc2code_cpp") +class Desc2CodeCppProblem(Desc2CodeProblem): + """Description2Code for C++ problem.""" + + @property + def pb_constants(self): + return PB_CPP + + def preprocess_target(self, target): + """Pre-process Cpp files.""" + target = re.sub(_RE_CPP_INLINE_COMMENT, " ", target) # Remove 
comments + # The regex rule is quite simple, So will fail if a // is inside a string, + # and don't remove /* */ comments + target = " ".join(target.split()) # Normalize all spaces + return target + + # Utils functions -def generator_samples(tmp_dir): +def generator_samples(tmp_dir, pb_cst): """Generator for the dataset samples. If not present, download and extract the dataset. Args: tmp_dir: path to the directory where to download the dataset. + pb_cst: CodingPbConstants object defining paths Yields: A CodingPbInfo object containing the next challenge informations. @@ -215,7 +279,7 @@ def contains_samples(subdir, dirs, files): # pylint: disable=unused-argument """Check that the folder contains a problem.""" return ( _DESC_DIR_NAME in dirs and - _CODE_PY_DIR_NAME in dirs + pb_cst.code_dir_name in dirs ) def next_sample(subdir, dirs, files): # pylint: disable=unused-argument @@ -224,10 +288,18 @@ def next_sample(subdir, dirs, files): # pylint: disable=unused-argument # pairs, the problem difficulty, the names of the algorithmic techniques # needed) desc_file = os.path.join(subdir, _DESC_DIR_NAME, "description.txt") - code_rootdir = os.path.join(subdir, _CODE_PY_DIR_NAME) - code_files = [ - f for f in tf.gfile.Glob(os.path.join(code_rootdir, "*.txt")) - ] + code_files = [] + # As the dataset is noisy, the program deduce the language from the file + # content. + code_pattern = os.path.join(subdir, pb_cst.code_dir_name, "*.txt") + for f in tf.gfile.Glob(code_pattern): + with tf.gfile.GFile(f, mode="r") as target_file: + # Hack to filter C++/Java files. In theory some python comments could + # make the file be concidered as C++ but in practice the chance of + # getting a false negative is low. 
+ content = target_file.read() + if not any(p in content for p in pb_cst.filter_patterns): + code_files.append(f) return CodingPbInfo( desc_file=desc_file, code_files=code_files @@ -239,4 +311,3 @@ def next_sample(subdir, dirs, files): # pylint: disable=unused-argument for w in tf.gfile.Walk(data_rootdir): if contains_samples(*w): yield next_sample(*w) - diff --git a/tensor2tensor/data_generators/desc2code_test.py b/tensor2tensor/data_generators/desc2code_test.py new file mode 100644 index 000000000..24b7568d0 --- /dev/null +++ b/tensor2tensor/data_generators/desc2code_test.py @@ -0,0 +1,63 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for desc2code.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports +from tensor2tensor.data_generators import desc2code + +import tensorflow as tf + +CODE_CPP_IN = """ + #include + +void main() { // This comment will be removed + // This too. 
+ // + /* Not this one */ +\t +\t + int a \t\n = 3;// +// +} + +""" + +CODE_CPP_OUT = ("#include void main() { /* Not this one */ int a = " + "3; }") + + +class Desc2codeTest(tf.test.TestCase): + + def testCppPreprocess(self): + """Check that the file correctly preprocess the code source.""" + cpp_pb = desc2code.Desc2CodeCppProblem() + + self.assertEqual( # Add space beween two lines + cpp_pb.preprocess_target("firstline//comm1\nsecondline//comm2\n"), + "firstline secondline") + # Checking for boths comments and spaces + self.assertEqual(cpp_pb.preprocess_target(CODE_CPP_IN), CODE_CPP_OUT) + self.assertEqual( + cpp_pb.preprocess_target(" not removed //abcd "), + "not removed //abcd") + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index d314cec59..341a20c71 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -109,10 +109,10 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): # Collect created shard processes to start and join processes = [] - datasets = [ - (self.training_filepaths, self.num_shards, "train", num_train_examples), - (self.dev_filepaths, 10, "valid", num_dev_examples), - (self.test_filepaths, 10, "test", num_test_examples)] + datasets = [(self.training_filepaths, self.num_shards, "train", + num_train_examples), (self.dev_filepaths, 10, "valid", + num_dev_examples), + (self.test_filepaths, 10, "test", num_test_examples)] for fname_fn, nshards, key_prefix, num_examples in datasets: outfiles = fname_fn(data_dir, nshards, shuffled=False) all_filepaths.extend(outfiles) @@ -146,17 +146,14 @@ def hparams(self, defaults, model_hparams): p = defaults vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)} - p.target_modality = ("%s:real" % registry.Modalities.GENERIC, + p.target_modality = 
("%s:log_poisson_loss" % registry.Modalities.REAL, self.num_output_predictions) p.input_space_id = problem.SpaceID.DNA p.target_space_id = problem.SpaceID.REAL def example_reading_spec(self): - # TODO(rsepassi): propagate and apply targets_mask to output RealModality - # and to eval metrics (weights_fn?). data_fields = { "inputs": tf.VarLenFeature(tf.int64), - "targets_mask": tf.VarLenFeature(tf.float32), "targets": tf.VarLenFeature(tf.float32), } data_items_to_decoders = None @@ -166,20 +163,17 @@ def preprocess_examples(self, examples, mode, hparams): del mode del hparams - # Reshape targets + # Reshape targets to contain num_output_predictions per output timestep examples["targets"] = tf.reshape(examples["targets"], - [-1, self.num_output_predictions]) - examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1]) - - # Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them. - # Add epsilon because some unmasked labels are actually 0. - examples["targets"] += 1e-6 - examples["targets"] *= examples["targets_mask"] + [-1, 1, self.num_output_predictions]) + # Slice off EOS - not needed, and messes up the GeneExpressionConv model + # which expects the input length to be a multiple of the target length. + examples["inputs"] = examples["inputs"][:-1] return examples def eval_metrics(self): - return [metrics.Metrics.RMSE] + return [metrics.Metrics.LOG_POISSON, metrics.Metrics.R2] @registry.register_problem("gene_expression_cage10") @@ -260,7 +254,12 @@ def dataset_generator(filepath, if i % 100 == 0: print("Generating example %d for %s" % (i, dataset)) inputs, mask, outputs = inp_data[i], mask_data[i], out_data[i] - yield to_example_dict(encoder, inputs, mask, outputs) + ex_dict = to_example_dict(encoder, inputs, mask, outputs) + # Original data has one output for every 128 input bases. Ensure that the + # ratio has been maintained given the chunk size and removing EOS. 
+ assert (len(ex_dict["inputs"]) - 1) == (( + 128 // chunk_size) * ex_dict["targets_shape"][0]) + yield ex_dict def to_example_dict(encoder, inputs, mask, outputs): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index fb7e53cb7..07fafb492 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -88,6 +88,20 @@ class SpaceID(object): PEPTIDE = 26 # Python PY_TOK = 27 + # C++ + CPP_TOK = 28 + + +def preprocess_examples_common(examples, hparams): + """Preprocessing steps common to all models.""" + if hparams.max_input_seq_length > 0: + examples["inputs"] = examples["inputs"][:hparams.max_input_seq_length] + if hparams.max_target_seq_length > 0: + examples["targets"] = examples["targets"][:hparams.max_target_seq_length] + if hparams.prepend_inputs_to_targets: + examples["targets"] = tf.concat( + [examples["inputs"], [0], examples["targets"]], 0) + return examples class Problem(object): @@ -170,11 +184,7 @@ def example_reading_spec(self): def preprocess_examples(self, examples, mode, hparams): del mode - if hparams.max_input_seq_length > 0: - examples["inputs"] = examples["inputs"][:hparams.max_input_seq_length] - if hparams.max_target_seq_length > 0: - examples["targets"] = examples["targets"][:hparams.max_target_seq_length] - return examples + return preprocess_examples_common(examples, hparams) def eval_metrics(self): return [ @@ -424,5 +434,6 @@ def eval_metrics(self): return [ metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, - metrics.Metrics.APPROX_BLEU + metrics.Metrics.APPROX_BLEU, metrics.Metrics.ROUGE_2_F, + metrics.Metrics.ROUGE_L_F ] diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index a43afec47..2b1bd124f 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -166,17 +166,17 @@ def 
add_positional_embedding_nd(x, max_length, name): def embedding_to_padding(emb): - """Input embeddings -> is_padding. + """Calculates the padding mask based on which embeddings are all zero. We have hacked symbol_modality to return all-zero embeddings for padding. Args: emb: a Tensor with shape [..., depth]. Returns: - a boolean Tensor with shape [...]. + a float Tensor with shape [...]. """ emb_sum = tf.reduce_sum(tf.abs(emb), axis=-1) - return tf.equal(emb_sum, 0.0) + return tf.to_float(tf.equal(emb_sum, 0.0)) def attention_bias_lower_triangle(length): @@ -197,13 +197,13 @@ def attention_bias_ignore_padding(memory_padding): """Create an bias tensor to be added to attention logits. Args: - memory_padding: a boolean `Tensor` with shape [batch, memory_length]. + memory_padding: a float `Tensor` with shape [batch, memory_length]. Returns: a `Tensor` with shape [batch, 1, 1, memory_length]. """ - ret = tf.to_float(memory_padding) * -1e9 - return tf.expand_dims(tf.expand_dims(ret, 1), 1) + ret = memory_padding * -1e9 + return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1) def attention_bias_proximal(length): @@ -523,8 +523,7 @@ def pad_l_and_r(x, pad_length): # [batch, heads, blocks, block_length, dim] k_new = tf.transpose(k_new, [2, 3, 0, 1, 4]) - attention_bias = tf.expand_dims( - tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) + attention_bias = tf.expand_dims(embedding_to_padding(k_new) * -1e9, axis=-2) v_t = tf.transpose(v, [2, 0, 1, 3]) v_new = tf.gather(v_t, gather_indices) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 6ecb06fb4..10b5e7e59 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -69,8 +69,23 @@ def basic_params1(): sampling_method="argmax", # "argmax" or "random" problem_choice="adaptive", # "uniform", "adaptive", "distributed" multiply_embedding_mode="sqrt_depth", + # Sequences of operations to perform on layer input and layer output. 
+ # Used by common_layers.layer_preprocess, common_layers.layer_postprocess + # Each character repsesnts an operation: + # d: apply dropout + # n: apply normalization (see norm_type and norm_epsilon) + # a: add layer input (residual connection - only during postprocess) + # TODO(noam): The current settings ("", "dan") are the published version + # of the transformer. ("n", "da") seems better for harder-to-learn + # models, so it should probably be the default. + layer_preprocess_sequence="", + layer_postprocess_sequence="dan", + # dropout rate to use during layer_preprocess and layer_postprocess + layer_prepostprocess_dropout=0.1, + # What type of normalization to use norm_type="none", # "batch", layer", "noam", "none". - layer_norm_epsilon=1e-6, + # epsilon parameter to normalization function + norm_epsilon=1e-6, symbol_modality_num_shards=16, # setting the max length in a minibatch. 0 means default behavior, # max_length = hparams.batch_size * length_multiplier @@ -103,7 +118,13 @@ def basic_params1(): # mean there is no maximum or truncation. # You can change this behavior by overridding preprocess_examples() method # in your problem class. - max_target_seq_length=0) + max_target_seq_length=0, + # Treat a seq-to-seq problem as a language model by prepending the + # inputs to the targets. During training, the loss is on both the + # inputs and the targets. During eval, metrics are computed only on the + # target portion. 
+ prepend_inputs_to_targets=int(False), + ) class RangedHParams(object): diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index ea18322e4..a85430c1c 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -359,13 +359,23 @@ def conv2d_kernel(kernel_size_arg, name_suffix): return conv2d_kernel(kernel_size, "single") -def conv(inputs, filters, kernel_size, **kwargs): - return conv_internal(tf.layers.conv2d, inputs, filters, kernel_size, **kwargs) +def conv(inputs, filters, kernel_size, dilation_rate=1, **kwargs): + return conv_internal( + tf.layers.conv2d, + inputs, + filters, + kernel_size, + dilation_rate=dilation_rate, + **kwargs) -def conv1d(inputs, filters, kernel_size, **kwargs): +def conv1d(inputs, filters, kernel_size, dilation_rate=1, **kwargs): return tf.squeeze( - conv(tf.expand_dims(inputs, 2), filters, (kernel_size, 1), **kwargs), 2) + conv( + tf.expand_dims(inputs, 2), + filters, (kernel_size, 1), + dilation_rate=(dilation_rate, 1), + **kwargs), 2) def separable_conv(inputs, filters, kernel_size, **kwargs): @@ -452,64 +462,136 @@ def layer_norm(x, filters=None, epsilon=1e-6, name=None, reuse=None): return result -def noam_norm(x, name=None): +def noam_norm(x, epsilon=1.0, name=None): """One version of layer normalization.""" with tf.name_scope(name, default_name="noam_norm", values=[x]): shape = x.get_shape() ndims = len(shape) - return (tf.nn.l2_normalize(x, ndims - 1, epsilon=1.0) * + return (tf.nn.l2_normalize(x, ndims - 1, epsilon=epsilon) * tf.sqrt(tf.to_float(shape[-1]))) -def get_norm(norm_type): - """Get the normalizer function.""" +def apply_norm(x, norm_type, depth, epsilon): + """Apply Normalization.""" if norm_type == "layer": - return lambda x, name, filters=None, epsilon=1e-6: layer_norm( # pylint: disable=g-long-lambda - x, filters=filters, epsilon=epsilon, name=name) + return layer_norm(x, filters=depth, epsilon=epsilon) if norm_type == "batch": - return 
tf.layers.batch_normalization + return tf.layers.batch_normalization(x, epsilon=epsilon) if norm_type == "noam": - return noam_norm + return noam_norm(x, epsilon) if norm_type == "none": - return lambda x, name: x + return x raise ValueError("Parameter normalizer_fn must be one of: 'layer', 'batch'," "'noam', 'none'.") -def residual_fn(x, - y, - norm_type, - residual_dropout, - filters=None, - epsilon=1e-16, - name=None, - reuse=None): - """Returns a function for combining layer input and layer output. +def layer_prepostprocess(previous_value, + x, + sequence, + dropout_rate, + norm_type, + depth, + epsilon, + name): + """Apply a sequence of functions to the input or output of a layer. + + The sequence is specified as a string which may contain the following + characters: + a: add previous_value + n: apply normalization + d: apply dropout - The returned function on x (layer input) and y (layer output) computes: - norm_function(x + dropout(y)) + For example, if sequence=="dna", then the output is + previous_value + normalize(dropout(x)) Args: - x: tensor, input layer - y: tensor, output layer - norm_type: string, type of normalizer function - residual_dropout: integer, dropout value for residual connection - filters: integer, dimension for layer norm, optional - epsilon: integer, value of layer norm epsilon - name: string, name - reuse: bool, whether to reuse + previous_value: A Tensor, to be added as a residual connection ('a') + x: A Tensor to be transformed. + sequence: a string. + dropout_rate: a float + norm_type: a string (see apply_norm()) + depth: an integer (size of last dimension of x). + epsilon: a float (parameter for normalization) + name: a string Returns: - residual layer output with applied norm_fn. 
+ a Tensor """ - with tf.variable_scope( - name, default_name="residual", values=[x, y], reuse=reuse): - norm_fn = get_norm(norm_type) - res = x + tf.nn.dropout(y, 1.0 - residual_dropout) - if norm_type == "layer": - return norm_fn(res, filters=filters, epsilon=epsilon, name=norm_type) - else: - return norm_fn(res, name=norm_type) + with tf.variable_scope(name): + for c in sequence: + if c == "a": + x += previous_value + elif c == "n": + x = apply_norm(x, norm_type, depth, epsilon) + else: + assert c == "d", ("Unknown sequence step %s" % c) + x = tf.nn.dropout(x, 1.0 - dropout_rate) + return x + + +def layer_preprocess(layer_input, hparams): + """Apply layer preprocessing. + + See layer_prepostprocess() for details. + + A hyperparemeters object is passed for convenience. The hyperparameters + that may be used are: + + layer_preprocess_sequence + layer_prepostprocess_dropout + norm_type + hidden_size + norm_epsilon + + Args: + layer_input: a Tensor + hparams: a hyperparameters object. + + Returns: + a Tensor + """ + assert "a" not in hparams.layer_preprocess_sequence, ( + "No residual connections allowed in hparams.layer_preprocess_sequence") + return layer_prepostprocess( + None, layer_input, + sequence=hparams.layer_preprocess_sequence, + dropout_rate=hparams.layer_prepostprocess_dropout, + norm_type=hparams.norm_type, + depth=hparams.hidden_size, + epsilon=hparams.norm_epsilon, + name="layer_prepostprocess") + + +def layer_postprocess(layer_input, layer_output, hparams): + """Apply layer postprocessing. + + See layer_prepostprocess() for details. + + A hyperparemeters object is passed for convenience. The hyperparameters + that may be used are: + + layer_postprocess_sequence + layer_prepostprocess_dropout + norm_type + hidden_size + norm_epsilon + + Args: + layer_input: a Tensor + layer_output: a Tensor + hparams: a hyperparameters object. 
+ + Returns: + a Tensor + """ + return layer_prepostprocess( + layer_input, layer_output, + sequence=hparams.layer_postprocess_sequence, + dropout_rate=hparams.layer_prepostprocess_dropout, + norm_type=hparams.norm_type, + depth=hparams.hidden_size, + epsilon=hparams.norm_epsilon, + name="layer_postprocess") def conv_block_internal(conv_fn, @@ -1344,6 +1426,22 @@ def weights_nonzero(labels): return tf.to_float(tf.not_equal(labels, 0)) +def weights_second_part(labels): + """Weights function for 'prepend_inputs_to_targets'. + + Weight 1.0 is assigned to all nonzero labels past the first zero. + + Args: + labels: A Tensor of int32s. + + Returns: + A Tensor of floats. + """ + past_first_zero = tf.cumsum(tf.to_float(tf.equal(labels, 0))) + nonzero = tf.to_float(labels) + return tf.to_float(tf.not_equal(past_first_zero * nonzero, 0)) + + def weights_all(labels): """Assign weight 1.0 to all labels.""" return tf.ones_like(labels, dtype=tf.float32) diff --git a/tensor2tensor/layers/common_layers_test.py b/tensor2tensor/layers/common_layers_test.py index df3ccc68f..3cf3f3374 100644 --- a/tensor2tensor/layers/common_layers_test.py +++ b/tensor2tensor/layers/common_layers_test.py @@ -303,74 +303,43 @@ def testDeconvStride2MultiStep(self): actual = session.run(a) self.assertEqual(actual.shape, (5, 32, 1, 16)) - def testGetNormLayerFn(self): - norm_type = "layer" + def testApplyNormLayer(self): with self.test_session() as session: - a = common_layers.get_norm(norm_type) x1 = np.random.rand(5, 2, 1, 11) - x2 = a(tf.constant(x1, dtype=tf.float32), name="layer", filters=11) + x2 = common_layers.apply_norm( + tf.constant(x1, dtype=tf.float32), "layer", depth=11, epsilon=1e-6) session.run(tf.global_variables_initializer()) actual = session.run(x2) self.assertEqual(actual.shape, (5, 2, 1, 11)) - def testGetNormNoamFn(self): - norm_type = "noam" + def testApplyNormNoam(self): with self.test_session() as session: - a = common_layers.get_norm(norm_type) x1 = np.random.rand(5, 2, 1, 11) - 
x2 = a(tf.constant(x1, dtype=tf.float32), name="noam") + x2 = common_layers.apply_norm( + tf.constant(x1, dtype=tf.float32), "noam", depth=11, epsilon=1e-6) session.run(tf.global_variables_initializer()) actual = session.run(x2) self.assertEqual(actual.shape, (5, 2, 1, 11)) - def testGetNormBatchFn(self): - norm_type = "batch" + def testApplyNormBatch(self): with self.test_session() as session: - a = common_layers.get_norm(norm_type) x1 = np.random.rand(5, 2, 1, 11) - x2 = a(tf.constant(x1, dtype=tf.float32), name="batch") + x2 = common_layers.apply_norm( + tf.constant(x1, dtype=tf.float32), "batch", depth=11, epsilon=1e-6) session.run(tf.global_variables_initializer()) actual = session.run(x2) self.assertEqual(actual.shape, (5, 2, 1, 11)) - def testGetNormNoneFn(self): - norm_type = "none" + def testApplyNormNone(self): with self.test_session() as session: - a = common_layers.get_norm(norm_type) x1 = np.random.rand(5, 2, 1, 11) - x2 = a(tf.constant(x1, dtype=tf.float32), name="none") + x2 = common_layers.apply_norm( + tf.constant(x1, dtype=tf.float32), "none", depth=11, epsilon=1e-6) session.run(tf.global_variables_initializer()) actual = session.run(x2) self.assertEqual(actual.shape, (5, 2, 1, 11)) self.assertAllClose(actual, x1, atol=1e-03) - def testResidualFn(self): - norm_type = "batch" - with self.test_session() as session: - x1 = np.random.rand(5, 2, 1, 11) - x2 = np.random.rand(5, 2, 1, 11) - x3 = common_layers.residual_fn( - tf.constant(x1, dtype=tf.float32), - tf.constant(x2, dtype=tf.float32), norm_type, 0.1) - session.run(tf.global_variables_initializer()) - actual = session.run(x3) - self.assertEqual(actual.shape, (5, 2, 1, 11)) - - def testResidualFnWithLayerNorm(self): - norm_type = "layer" - with self.test_session() as session: - x1 = np.random.rand(5, 2, 1, 11) - x2 = np.random.rand(5, 2, 1, 11) - x3 = common_layers.residual_fn( - tf.constant(x1, dtype=tf.float32), - tf.constant(x2, dtype=tf.float32), - norm_type, - 0.1, - epsilon=0.1) - 
session.run(tf.global_variables_initializer()) - actual = session.run(x3) - self.assertEqual(actual.shape, (5, 2, 1, 11)) - def testGlobalPool1d(self): x1 = np.random.rand(5, 4, 11) no_mask = np.ones((5, 4)) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 523c52fa8..e44729041 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -438,6 +438,7 @@ def __init__(self, model_hparams, vocab_size): @registry.register_image_modality("identity") @registry.register_symbol_modality("identity") @registry.register_class_label_modality("identity") +@registry.register_real_modality("identity") class IdentityModality(modality.Modality): """Does nothing.""" @@ -452,9 +453,12 @@ def top(self, body_output, _): return body_output -@registry.register_generic_modality("real") class RealModality(modality.Modality): - """Modality for real (i.e. float) vectors.""" + """Base class for real (i.e. float) vectors. + + * Bottom is a linear projection layer to hparams.hidden_size. + * Top is a linear projection layer to vocab_size. + """ def bottom(self, x): with tf.variable_scope("real"): @@ -464,7 +468,16 @@ def top(self, body_output, _): with tf.variable_scope("real"): return tf.layers.dense(body_output, self._vocab_size) - def loss(self, top_out, targets, weights_fn=common_layers.weights_nonzero): + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): + raise NotImplementedError() + + +@registry.register_real_modality("default") +@registry.register_real_modality("l2_loss") +class RealL2LossModality(RealModality): + """Modality for real (i.e. 
float) vectors with L2 (Gaussian) loss.""" + + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): predictions = top_out with tf.name_scope("l2"): weights = weights_fn(targets) @@ -472,6 +485,26 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_nonzero): return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) +@registry.register_real_modality("log_poisson_loss") +class RealLogPoissonLossModality(RealL2LossModality): + """Modality for real (i.e. float) vectors with log Poisson regression loss. + + * Top is a linear projection to vocab size followed by a softplus + transform (log(exp(features) + 1)). + """ + + def top(self, body_output, _): + with tf.variable_scope("real"): + return tf.nn.softplus(tf.layers.dense(body_output, self._vocab_size)) + + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): + predictions = top_out + with tf.name_scope("log_possion"): + weights = weights_fn(targets) + lp_loss = tf.nn.log_poisson_loss(targets, predictions) + return tf.reduce_sum(lp_loss * weights), tf.reduce_sum(weights) + + @registry.register_image_modality("identity_no_pad") class IdentityModalityNoPad(modality.Modality): """Does nothing except making sure that there is no padding in cross-ent.""" diff --git a/tensor2tensor/layers/rev_block.py b/tensor2tensor/layers/rev_block.py new file mode 100644 index 000000000..1e1a7b848 --- /dev/null +++ b/tensor2tensor/layers/rev_block.py @@ -0,0 +1,195 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Reversible Residual Block. + +From +[The Reversible Residual Network: Backpropagation Without Storing +Activations](https://arxiv.org/abs/1707.04585). +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re + +# Dependency imports + +import tensorflow as tf +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import function + +LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*") + + +def _rev_layer_forward(xs, f, g): + """Forward for 1 reversible layer.""" + x1, x2 = xs + with tf.variable_scope("f"): + y1 = x1 + f(x2) + with tf.variable_scope("g"): + y2 = x2 + g(y1) + return (y1, y2) + + +def _rev_layer_backward(ys, grad_ys, f, g, f_vars, g_vars): + """Backprop for 1 layer.""" + y1, y2 = ys + grad_y1, grad_y2 = grad_ys + + # Reconstruct intermediates and inputs (x1, x2) + # stop_gradients required on y1 and x2 to prevent infinite recursion into this + # grad function on the calls to tf.gradients. 
+ y1_stop = tf.stop_gradient(y1) + with tf.variable_scope("g"): + gy1 = g(y1_stop) + + x2 = y2 - gy1 + x2_stop = tf.stop_gradient(x2) + with tf.variable_scope("f"): + fx2 = f(x2_stop) + + x1 = y1 - fx2 + + # Compute gradients wrt to inputs + # dL/dy2 * dG(y1)/y1 + grad_gy1_y2 = tf.gradients(gy1, y1_stop, grad_y2)[0] + grad_x1 = grad_y1 + grad_gy1_y2 + grad_x2 = (tf.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 + tf.gradients( + fx2, x2_stop, grad_gy1_y2)[0]) + + # Compute gradients wrt to vars in f and g + grad_g_vars = tf.gradients(gy1, g_vars, grad_y2) + grad_f_y1 = tf.gradients(fx2, f_vars, grad_y1) + grad_f_y2 = tf.gradients(fx2, f_vars, grad_gy1_y2) + grad_f_vars = [tf.add_n(grads) for grads in zip(grad_f_y1, grad_f_y2)] + + return (x1, x2), (grad_x1, grad_x2), grad_f_vars, grad_g_vars + + +def _rev_block_forward(x, f, g, num_layers=1, layer_scopes=None, name=None): + """Forward for a series of reversible layers.""" + x1, x2 = tf.split(x, 2, axis=len(x.get_shape()) - 1) + out = (x1, x2) + with tf.variable_scope(name, default_name="revblock"): + for i in xrange(num_layers): + with tf.variable_scope("revlayer_%d" % i) as layer_vs: + if layer_scopes is not None: + layer_scopes.append(layer_vs) + out = _rev_layer_forward(out, f, g) + + y1, y2 = out + y = tf.concat([y1, y2], axis=-1) + return y + + +def rev_block(x, f, g, num_layers=1, is_training=True): + """A block of reversible residual layers. + + A reversible residual layer is defined as: + + ``` + x1, x2 = tf.split(x, 2, axis=-1) + y1 = x1 + f(x2) + y2 = x2 + g(y1) + y = tf.concat([y1, y2], axis=-1) + ``` + + Args: + x: a float Tensor, input, will be split evenly across the last dim. + f: a function, (Tensor) -> (Tensor). Should not change the shape of the + Tensor. May create variables. Should NOT close over any Tensor values. + g: a function, (Tensor) -> (Tensor). Should not change the shape of the + Tensor. May create variables. Should NOT close over any Tensor values. 
+ num_layers: int, number of reversible residual layers. Each layer will + apply f and g according to the equations above, with new variables in each + layer. + is_training: bool, whether to actually use the efficient backprop codepath. + + Returns: + y: a float Tensor, output. + """ + layer_scopes = [] + + def rev_block_grad(op, grad_y): + """Custom gradient fn for a block of reversible residual layers.""" + y = op.outputs[0] + ys = tf.split(y, 2, axis=len(y.get_shape()) - 1) + grad_ys = tf.split(grad_y, 2, axis=len(y.get_shape()) - 1) + + # Find all variables from f and from g + # Keep track of their positions in all_vars + all_vars = op.inputs[1:] + f_vars = [[] for _ in range(num_layers)] + g_vars = [[] for _ in range(num_layers)] + f_vars_idxs = [[] for _ in range(num_layers)] + g_vars_idxs = [[] for _ in range(num_layers)] + + for i, v in enumerate(all_vars): + ref = v.op.inputs[0] + assert ref.dtype == dtypes.float32_ref + regex = LAYER_RE.match(v.name) + layer_no = int(regex.group(1)) + fn_name = regex.group(2) + if fn_name == "f": + f_vars[layer_no].append(ref) + f_vars_idxs[layer_no].append(i) + else: + assert fn_name == "g" + g_vars[layer_no].append(ref) + g_vars_idxs[layer_no].append(i) + + f_grads = [] + g_grads = [] + + # Reverse state containers to go backward + layer_scopes.reverse() + f_vars.reverse() + g_vars.reverse() + + for i in xrange(num_layers): + with tf.variable_scope(layer_scopes[i], reuse=True): + ys, grad_ys, grad_f_vars, grad_g_vars = _rev_layer_backward( + ys, grad_ys, f, g, f_vars[i], g_vars[i]) + f_grads.append(grad_f_vars) + g_grads.append(grad_g_vars) + + # Gradients were collected in reverse layer order + f_grads.reverse() + g_grads.reverse() + + # Reorder the gradients so they match the original order of all_vars + var_grads = [None] * len(all_vars) + for idxs, grads in zip(f_vars_idxs, f_grads) + zip(g_vars_idxs, g_grads): + for i, grad in zip(idxs, grads): + var_grads[i] = grad + + grad_x = tf.concat(grad_ys, axis=-1) + 
all_grads = [grad_x] + var_grads + return all_grads + + @function.Defun( + tf.float32, + python_grad_func=rev_block_grad, + shape_func=lambda _: [x.get_shape()]) + def rev_block_defun(inp): + inp.set_shape(x.get_shape()) + return _rev_block_forward( + inp, f, g, num_layers=num_layers, layer_scopes=layer_scopes) + + if is_training: + return rev_block_defun(x) + else: + return _rev_block_forward(x, f, g, num_layers=num_layers) diff --git a/tensor2tensor/layers/rev_block_test.py b/tensor2tensor/layers/rev_block_test.py new file mode 100644 index 000000000..bc4bcc6a4 --- /dev/null +++ b/tensor2tensor/layers/rev_block_test.py @@ -0,0 +1,92 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for RevBlock.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.layers import rev_block + +import tensorflow as tf + + +class RevBlockTest(tf.test.TestCase): + + def testSmoke(self): + channels = 8 + num_layers = 4 + batch_size = 16 + use_defun = True + tf.set_random_seed(1234) + + def f(x): + return tf.layers.dense(x, channels // 2, use_bias=True) + + def g(x): + return tf.layers.dense(x, channels // 2, use_bias=True) + + x = tf.random_uniform([batch_size, channels], dtype=tf.float32) + y = rev_block.rev_block( + x, f, g, num_layers=num_layers, is_training=use_defun) + loss = tf.reduce_mean(y + 10.) 
+ grads = tf.gradients(loss, [x] + tf.global_variables()) + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + _ = sess.run(grads) + + def testRevBlock(self): + channels = 8 + num_layers = 4 + batch_size = 16 + tf.set_random_seed(1234) + + def f(x): + return tf.layers.dense(x, channels // 2, use_bias=True) + + def g(x): + return tf.layers.dense(x, channels // 2, use_bias=True) + + x = tf.random_uniform([batch_size, channels], dtype=tf.float32) + + with tf.variable_scope("defun") as vs: + y_defun = rev_block.rev_block(x, f, g, num_layers=num_layers) + fg_vars = vs.trainable_variables() + + num_vars = len(tf.global_variables()) + with tf.variable_scope(vs, reuse=True): + y = rev_block.rev_block(x, f, g, num_layers=num_layers, is_training=False) + # Ensure no new vars were created - full reuse + assert len(tf.global_variables()) == num_vars + + loss_defun = tf.reduce_mean(y_defun + 10.) + loss = tf.reduce_mean(y + 10.) + + grads_defun = tf.gradients(loss_defun, [x] + fg_vars) + grads = tf.gradients(loss, [x] + fg_vars) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + y_val, yd_val, gd_val, g_val = sess.run([y, y_defun, grads_defun, grads]) + self.assertAllClose(y_val, yd_val) + for g1, g2 in zip(gd_val, g_val): + self.assertAllClose(g1, g2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 664bc9e21..19f1915e8 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -51,13 +51,10 @@ def model_fn_body(self, features): (decoder_input, decoder_self_attention_bias) = attention_lm_prepare_decoder( targets, hparams) - def residual_fn(x, y): - return common_layers.layer_norm(x + tf.nn.dropout( - y, 1.0 - hparams.residual_dropout)) - - decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - decoder_output = attention_lm_decoder(decoder_input, residual_fn, 
- decoder_self_attention_bias, hparams) + decoder_input = tf.nn.dropout( + decoder_input, 1.0 - hparams.layer_prepostprocess_dropout) + decoder_output = attention_lm_decoder( + decoder_input, decoder_self_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output @@ -84,7 +81,6 @@ def attention_lm_prepare_decoder(targets, hparams): def attention_lm_decoder(decoder_input, - residual_fn, decoder_self_attention_bias, hparams, name="decoder"): @@ -92,7 +88,6 @@ def attention_lm_decoder(decoder_input, Args: decoder_input: a Tensor - residual_fn: a function from (layer_input, layer_output) -> combined_output decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model @@ -105,25 +100,25 @@ def attention_lm_decoder(decoder_input, with tf.variable_scope(name): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): - x = residual_fn( - x, - common_attention.multihead_attention( - x, - None, - decoder_self_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention")) - x = residual_fn(x, - common_layers.conv_hidden_relu( - x, - hparams.filter_size, - hparams.hidden_size, - dropout=hparams.relu_dropout)) - return x + with tf.variable_scope("self_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess(x, hparams), + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout) + x = common_layers.layer_postprocess(x, y, hparams) + with tf.variable_scope("ffn"): + y = common_layers.conv_hidden_relu( + common_layers.layer_preprocess(x, hparams), + 
hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout) + x = common_layers.layer_postprocess(x, y, hparams) + return common_layers.layer_preprocess(x, hparams) @registry.register_hparams @@ -145,7 +140,6 @@ def attention_lm_base(): hparams.weight_decay = 0.0 hparams.optimizer_adam_beta1 = 0.9 hparams.optimizer_adam_beta2 = 0.98 - hparams.num_sampled_classes = 0 hparams.label_smoothing = 0.0 hparams.shared_embedding_and_softmax_weights = int(False) @@ -158,7 +152,6 @@ def attention_lm_base(): # when not in training mode. hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("residual_dropout", 0.1) hparams.add_hparam("pos", "timing") # timing, none return hparams @@ -178,5 +171,19 @@ def attention_lm_small(): hparams.num_hidden_layers = 4 hparams.hidden_size = 512 hparams.filter_size = 2048 - hparams.residual_dropout = 0.5 + hparams.layer_prepostprocess_dropout = 0.5 + return hparams + + +@registry.register_hparams +def attention_lm_translation(): + """Version to use for seq2seq.""" + hparams = attention_lm_base() + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + hparams.learning_rate = 0.1 + hparams.prepend_inputs_to_targets = int(True) + hparams.max_length = 512 + hparams.label_smoothing = 0.1 + hparams.shared_embedding_and_softmax_weights = int(True) return hparams diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 780478fec..1869eef66 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -49,21 +49,24 @@ def model_fn_body_sharded(self, sharded_features): targets = sharded_features["targets"] targets = dp(tf.squeeze, targets, 2) + def preprocess(x): + return dp(common_layers.layer_preprocess, x, hparams) + + def postprocess(x, y): + return dp(common_layers.layer_postprocess, x, y, hparams) + (decoder_input, decoder_self_attention_bias) = dp( 
attention_lm_moe_prepare_decoder, targets, hparams) - def residual_fn(x, y): - return common_layers.layer_norm(x + tf.nn.dropout( - y, 1.0 - hparams.residual_dropout)) - - x = dp(tf.nn.dropout, decoder_input, 1.0 - hparams.residual_dropout) + x = dp(tf.nn.dropout, decoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) extra_loss = 0.0 for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("attention"): y = dp( common_attention.multihead_attention, - x, + preprocess(x), None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, @@ -72,11 +75,11 @@ def residual_fn(x, y): hparams.num_heads, hparams.attention_dropout, name="decoder_self_attention") - x = dp(residual_fn, x, y) + x = postprocess(x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): y, loss = common_layers.moe_layer( - dp, self._ps_devices, x, + dp, self._ps_devices, preprocess(x), hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) @@ -84,11 +87,12 @@ def residual_fn(x, y): else: y = dp( common_layers.conv_hidden_relu, - x, + preprocess(x), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) + x = preprocess(x) decoder_output = dp(tf.expand_dims, x, 2) return decoder_output, extra_loss @@ -163,7 +167,6 @@ def attention_lm_moe_base(): # when not in training mode. 
hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("residual_dropout", 0.1) hparams.add_hparam("pos", "timing") # timing, none return hparams @@ -232,5 +235,5 @@ def attention_lm_moe_large(): hparams.filter_size = 4096 hparams.moe_hidden_size = 4096 hparams.moe_n1 = 128 - hparams.residual_dropout = 0.2 + hparams.layer_prepostprocess_dropout = 0.2 return hparams diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index d9c4e29a9..e4537ef3f 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -40,13 +40,13 @@ def residual_dilated_conv(x, repeat, padding, name, hparams): for i in xrange(repeat): with tf.variable_scope("repeat_%d" % i): y = common_layers.conv_block( - x, + common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"), hparams.hidden_size, dilations_and_kernels, padding=padding, name="residual_conv") - x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") - x = tf.nn.dropout(x, hparams.dropout) + y = tf.nn.dropout(y, 1.0 - hparams.dropout) + x += y return x diff --git a/tensor2tensor/models/cycle_gan.py b/tensor2tensor/models/cycle_gan.py new file mode 100644 index 000000000..5fcf96266 --- /dev/null +++ b/tensor2tensor/models/cycle_gan.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Cycle GAN.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.layers import common_layers +from tensor2tensor.models import transformer_vae +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def reconstruct_loss(x, gt, hparams, reuse=None): + pred = tf.layers.dense(x, hparams.vocab_size, name="softmax", reuse=reuse) + xent, w = common_layers.padded_cross_entropy(pred, gt, 0.0) + return xent / w + + +def discriminator(x, compress, hparams, name, reuse=None): + with tf.variable_scope(name, reuse=reuse): + x = tf.stop_gradient(2 * x) - x # Reverse gradient. + if compress: + x = transformer_vae.compress(x, hparams, "compress") + else: + x = transformer_vae.residual_conv(x, 1, hparams, "compress_rc") + y = tf.reduce_mean(x, axis=1) + return tf.tanh(tf.layers.dense(y, 1, name="reduce")) + + +def discriminate_loss(x, y, compress, hparams, name): + with tf.variable_scope(name): + d1 = discriminator(x, compress, hparams, "discriminator") + d2 = discriminator(y, compress, hparams, "discriminator", reuse=True) + dloss = tf.reduce_mean(tf.abs(d1 - d2)) + return - dloss + + +def split_on_batch(x): + batch_size = tf.shape(x)[0] + i = batch_size // 2 + return x[:i, :, :, :], x[i:2*i, :, :, :] + + +def cycle_gan_internal(inputs, targets, _, hparams): + """Cycle GAN, main step used for training.""" + with tf.variable_scope("cycle_gan"): + # Embed inputs and targets. + inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets) + inputs = common_layers.embedding( + inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed") + targets = common_layers.embedding( + targets_orig, hparams.vocab_size, hparams.hidden_size, + "embed", reuse=True) + + # Split the batch into input-input and target-target parts. 
+ inputs1, _ = split_on_batch(inputs) + _, targets2 = split_on_batch(targets) + + # Define F and G, called inp2tgt and tgt2inp here. + def inp2tgt(x, reuse=False): + return transformer_vae.residual_conv(x, 1, hparams, "inp2tgt", reuse) + def tgt2inp(x, reuse=False): + return transformer_vae.residual_conv(x, 1, hparams, "tgt2inp", reuse) + + # Input-input part. + inp1_tgt = inp2tgt(inputs1) + inp1_back = tgt2inp(inp1_tgt) + + # Target-target part. + tgt2_inp = tgt2inp(targets2, reuse=True) + tgt2_back = inp2tgt(tgt2_inp, reuse=True) + + # Reconstruction losses. + inp1_orig, _ = split_on_batch(inputs_orig) + _, tgt2_orig = split_on_batch(targets_orig) + inp1_loss = reconstruct_loss( + inp1_back, tf.squeeze(inp1_orig, axis=3), hparams) + tgt2_loss = reconstruct_loss( + tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True) + + # Discriminator losses. + dloss1 = discriminate_loss(inputs1, tgt2_inp, True, hparams, "inp_disc") + dloss2 = discriminate_loss(targets2, inp1_tgt, True, hparams, "tgt_disc") + + # Reconstruct targets from inputs. + tgt = inp2tgt(inputs, reuse=True) + tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True) + + # We use the reconstruction only for tracking progress, no gradients here! + tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2)) + + losses = {"input_input": hparams.cycle_loss_multiplier * inp1_loss, + "target_target": hparams.cycle_loss_multiplier * tgt2_loss, + "input_disc": dloss1, + "target_disc": dloss2} + return tgt, losses + + +@registry.register_model +class CycleGAN(t2t_model.T2TModel): + + def model_fn_body(self, features): + return cycle_gan_internal( + features["inputs"], features["targets"], features["target_space_id"], + self._hparams) + + +def cycle_vae_gan_internal(inputs, targets, _, hparams): + """Cycle GAN, main step used for training.""" + with tf.variable_scope("cycle_vae_gan"): + # Embed inputs and targets. 
+ inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets) + k = 2**hparams.num_compress_steps + inputs_orig, targets_orig = common_layers.pad_to_same_length( + inputs_orig, targets_orig, final_length_divisible_by=k) + inputs = common_layers.embedding( + inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed") + targets = common_layers.embedding( + targets_orig, hparams.vocab_size, hparams.hidden_size, + "embed", reuse=True) + + # Split the batch into input-input and target-target parts. + inputs1, _ = split_on_batch(inputs) + _, targets2 = split_on_batch(targets) + + # Input-input part. + inp1_back, kl_loss1, inp1_mu, inp1_log_sigma = transformer_vae.vae_compress( + inputs1, hparams, "inp2hyp", "hyp2inp") + inp1_hyp = tf.concat([inp1_mu, inp1_log_sigma], axis=3) + + # Target-target part. + tgt2_back, kl_loss2, tgt2_mu, tgt2_log_sigma = transformer_vae.vae_compress( + targets2, hparams, "tgt2hyp", "hyp2tgt") + tgt2_hyp = tf.concat([tgt2_mu, tgt2_log_sigma], axis=3) + + # Reconstruction losses. + inp1_orig, _ = split_on_batch(inputs_orig) + _, tgt2_orig = split_on_batch(targets_orig) + inp1_loss = reconstruct_loss( + inp1_back, tf.squeeze(inp1_orig, axis=3), hparams) + tgt2_loss = reconstruct_loss( + tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True) + + # Discriminator loss. + dloss = discriminate_loss(inp1_hyp, tgt2_hyp, False, hparams, "dloss") + + # Reconstruct targets from inputs. + tgt, _, _, _ = transformer_vae.vae_compress( + inputs, hparams, "inp2hyp", "hyp2tgt", reuse=True) + tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True) + # We use the reconstruction only for tracking progress, no gradients here! 
+ tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2)) + + kl_rev_decay = common_layers.inverse_exp_decay(hparams.kl_warmup_steps) + losses = {"input_input": hparams.cycle_loss_multiplier * inp1_loss, + "target_target": hparams.cycle_loss_multiplier * tgt2_loss, + "input_kl": kl_loss1 * kl_rev_decay, + "target_kl": kl_loss2 * kl_rev_decay, + "discriminator": dloss} + return tgt, losses + + +@registry.register_model +class CycleVaeGAN(t2t_model.T2TModel): + + def model_fn_body(self, features): + return cycle_vae_gan_internal( + features["inputs"], features["targets"], features["target_space_id"], + self._hparams) + + +@registry.register_hparams +def cycle_gan_small(): + """Set of hyperparameters.""" + hparams = transformer_vae.transformer_vae_small() + hparams.batch_size = 2048 + hparams.input_modalities = "inputs:symbol:identity" + hparams.target_modality = "symbol:identity" + hparams.weight_decay = 3.0 + hparams.learning_rate = 0.005 + hparams.kl_warmup_steps = 5000 + hparams.learning_rate_warmup_steps = 3000 + hparams.add_hparam("vocab_size", 32) # Vocabulary size, need to set here. + hparams.add_hparam("cycle_loss_multiplier", 2.0) + return hparams diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py index af2d83158..27aa631c6 100644 --- a/tensor2tensor/models/gene_expression.py +++ b/tensor2tensor/models/gene_expression.py @@ -38,6 +38,14 @@ class GeneExpressionConv(t2t_model.T2TModel): http://www.biorxiv.org/content/early/2017/07/10/161851 Uses layer_norm instead of batch_norm. + + Model expects that if targets are of length m, inputs are of length 32*m. The + original data expected that inputs would be of length 128*m, but the data has + been preprocessed to chunk every 4 bases into 1 ID (see + data_generators/gene_expression.py). + + The magnitude of the length reduction is controlled by the pooling sizes + (hparams.pooling_windows) at each conv layer (hparams.num_conv_layers). 
""" def model_fn_body(self, features): @@ -50,6 +58,7 @@ def model_fn_body(self, features): out = common_layers.flatten4d3d(out) # Conv layers + assert hp.num_conv_layers == len(hp.pooling_windows) for i in xrange(hp.num_conv_layers): out = conv_layer( out, @@ -58,7 +67,7 @@ def model_fn_body(self, features): hp.stride, hp.pooling_windows[i], hp.dropout, - 1, + dilation_rate=1, name="conv_%d" % (i + 1)) # Dense dilated conv layers @@ -68,10 +77,10 @@ def model_fn_body(self, features): out, hp.hidden_size, hp.kernel_width, - 1, - 0, - hp.dropout, - dilation_rate, + stride=1, + pooling_window=0, + dropout_rate=hp.dropout, + dilation_rate=dilation_rate, name="dconv_%d" % (i + 1)) out = tf.concat([out, dconv_out], axis=2) @@ -121,12 +130,16 @@ def fc_layer(x, num_out, dropout_rate, name="fc"): def gene_expression_conv_base(): """Hparams for GeneExpressionConv model.""" hparams = common_hparams.basic_params1() + hparams.max_length = 10000000 + hparams.batch_size = 1024 + hparams.dropout = 0.1 hparams.add_hparam("num_conv_layers", 4) hparams.add_hparam("num_dconv_layers", 7) - hparams.add_hparam("pooling_windows", [2, 4, 4, 4]) + # The product of these pooling windows should match + # input_length/target_length. 
+ hparams.add_hparam("pooling_windows", [2, 2, 2, 4]) - # TODO(rsepassi): Correct the values of these hyperparameters - hparams.hidden_size = 128 - hparams.kernel_width = 128 + hparams.hidden_size = 256 + hparams.kernel_width = 20 hparams.add_hparam("stride", 1) return hparams diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index 3b1dc6873..e2307f49f 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -42,7 +42,7 @@ def _testModel(self, hparams, model_cls): batch_size = 3 target_length = 6 target_out = 10 # GeneExpressionProblem.num_output_predictions - input_length = target_length * 128 + input_length = target_length * 128 // 4 # chunk_size=4 input_vocab_size = 5 inputs = np.random.random_integers( diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index 4b1355dba..cba779fc9 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -28,6 +28,7 @@ from tensor2tensor.models import attention_lm_moe from tensor2tensor.models import bluenet from tensor2tensor.models import bytenet +from tensor2tensor.models import cycle_gan from tensor2tensor.models import gene_expression from tensor2tensor.models import long_answer from tensor2tensor.models import lstm diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 8900e6d11..1079659b5 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -111,7 +111,10 @@ def multi_conv_res(x, padding, name, layers, hparams, mask=None, source=None): hparams.separability - i for i in reversed(range(len(dilations_and_kernels2))) ] - norm_fn = common_layers.get_norm(hparams.norm_type) + def norm_fn(x, name): + with tf.variable_scope(name, default_name="norm"): + return common_layers.apply_norm( + x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) for layer in xrange(layers): with 
tf.variable_scope("layer_%d" % layer): y = common_layers.subseparable_conv_block( @@ -171,7 +174,10 @@ def similarity_cost(inputs_encoded, targets_encoded): def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams): """Middle part of slicenet, connecting encoder and decoder.""" - norm_fn = common_layers.get_norm(hparams.norm_type) + def norm_fn(x, name): + with tf.variable_scope(name, default_name="norm"): + return common_layers.apply_norm( + x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) # Flatten targets and embed target_space_id. targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c9c87da07..caf8ab198 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -55,36 +55,22 @@ def model_fn_body(self, features): (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( targets, hparams) - residual_fn = get_residual_fn(hparams) - - encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) - decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - encoder_output = transformer_encoder(encoder_input, residual_fn, - encoder_self_attention_bias, hparams) + encoder_input = tf.nn.dropout( + encoder_input, 1.0 - hparams.layer_prepostprocess_dropout) + decoder_input = tf.nn.dropout( + decoder_input, 1.0 - hparams.layer_prepostprocess_dropout) + encoder_output = transformer_encoder( + encoder_input, encoder_self_attention_bias, hparams) decoder_output = transformer_decoder( - decoder_input, encoder_output, residual_fn, decoder_self_attention_bias, + decoder_input, encoder_output, + decoder_self_attention_bias, encoder_decoder_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output -def get_residual_fn(hparams): - """Get residual_fn.""" - - def residual_fn(x, y): - return 
common_layers.residual_fn( - x, - y, - hparams.norm_type, - hparams.residual_dropout, - hparams.hidden_size, - epsilon=hparams.layer_norm_epsilon) - - return residual_fn - - def transformer_prepare_encoder(inputs, target_space, hparams): """Prepare one shard of the model for the encoder. @@ -143,7 +129,6 @@ def transformer_prepare_decoder(targets, hparams): def transformer_encoder(encoder_input, - residual_fn, encoder_self_attention_bias, hparams, name="encoder"): @@ -151,7 +136,6 @@ def transformer_encoder(encoder_input, Args: encoder_input: a Tensor - residual_fn: a function from (layer_input, layer_output) -> combined_output encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model @@ -164,25 +148,29 @@ def transformer_encoder(encoder_input, with tf.variable_scope(name): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): - x = residual_fn( - x, - common_attention.multihead_attention( - x, - None, - encoder_self_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="encoder_self_attention")) - x = residual_fn(x, transformer_ffn_layer(x, hparams)) - return x + with tf.variable_scope("self_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess(x, hparams), + None, + encoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout) + x = common_layers.layer_postprocess(x, y, hparams) + with tf.variable_scope("ffn"): + y = transformer_ffn_layer( + common_layers.layer_preprocess(x, hparams), hparams) + x = common_layers.layer_postprocess(x, y, hparams) + # if normalization is done in layer_preprocess, then it 
shuold also be done + # on the output, since the output can grow very large, being the sum of + # a whole stack of unnormalized layer outputs. + return common_layers.layer_preprocess(x, hparams) def transformer_decoder(decoder_input, encoder_output, - residual_fn, decoder_self_attention_bias, encoder_decoder_attention_bias, hparams, @@ -192,7 +180,6 @@ def transformer_decoder(decoder_input, Args: decoder_input: a Tensor encoder_output: a Tensor - residual_fn: a function from (layer_input, layer_output) -> combined_output decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention @@ -207,32 +194,36 @@ def transformer_decoder(decoder_input, with tf.variable_scope(name): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): - x = residual_fn( - x, - common_attention.multihead_attention( - x, - None, - decoder_self_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention")) - x = residual_fn( - x, - common_attention.multihead_attention( - x, - encoder_output, - encoder_decoder_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="encdec_attention")) - x = residual_fn(x, transformer_ffn_layer(x, hparams)) - return x + with tf.variable_scope("self_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess(x, hparams), + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout) + x = 
common_layers.layer_postprocess(x, y, hparams) + with tf.variable_scope("encdec_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess(x, hparams), + encoder_output, + encoder_decoder_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout) + x = common_layers.layer_postprocess(x, y, hparams) + with tf.variable_scope("ffn"): + y = transformer_ffn_layer( + common_layers.layer_preprocess(x, hparams), hparams) + x = common_layers.layer_postprocess(x, y, hparams) + # if normalization is done in layer_preprocess, then it shuold also be done + # on the output, since the output can grow very large, being the sum of + # a whole stack of unnormalized layer outputs. + return common_layers.layer_preprocess(x, hparams) def transformer_ffn_layer(x, hparams): @@ -307,13 +298,39 @@ def transformer_base(): # when not in training mode. hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("residual_dropout", 0.1) hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("nbr_decoder_problems", 1) hparams.add_hparam("proximity_bias", int(False)) return hparams +@registry.register_hparams +def transformer_n_da(): + """Normalize on layer input, instead of after residual connection. + + This version seems to cure failure-to-learn bugs - for example, with very + deep networks or hard-to-learn mappings. + + Probably this should become the default. + + Returns: + a hyperparameters. + """ + hparams = transformer_base() + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + # This version seems to benefit from a higher learning rate. 
+ hparams.learning_rate = 0.4 + return hparams + + +@registry.register_hparams +def transformer_n_da_l10(): + hparams = transformer_n_da() + hparams.num_hidden_layers = 10 + return hparams + + @registry.register_hparams def transformer_big(): """HParams for transfomer big model on WMT.""" @@ -322,7 +339,7 @@ def transformer_big(): hparams.filter_size = 4096 hparams.num_heads = 16 hparams.batching_mantissa_bits = 2 - hparams.residual_dropout = 0.3 + hparams.layer_prepostprocess_dropout = 0.3 return hparams @@ -330,7 +347,7 @@ def transformer_big(): def transformer_big_single_gpu(): """HParams for transformer big model for single gpu.""" hparams = transformer_big() - hparams.residual_dropout = 0.1 + hparams.layer_prepostprocess_dropout = 0.1 hparams.learning_rate_warmup_steps = 16000 hparams.optimizer_adam_beta2 = 0.998 hparams.batching_mantissa_bits = 3 @@ -352,7 +369,7 @@ def transformer_parsing_base(): """Hparams for parsing on wsj only.""" hparams = transformer_base() hparams.attention_dropout = 0.2 - hparams.residual_dropout = 0.2 + hparams.layer_prepostprocess_dropout = 0.2 hparams.max_length = 512 hparams.learning_rate_warmup_steps = 16000 hparams.hidden_size = 1024 @@ -368,7 +385,7 @@ def transformer_parsing_big(): hparams.max_length = 512 hparams.shared_source_target_embedding = int(False) hparams.learning_rate_warmup_steps = 4000 - hparams.residual_dropout = 0.1 + hparams.layer_prepostprocess_dropout = 0.1 hparams.batch_size = 2048 hparams.learning_rate = 0.05 return hparams @@ -424,6 +441,13 @@ def transformer_l8(): return hparams +@registry.register_hparams +def transformer_l10(): + hparams = transformer_base() + hparams.num_hidden_layers = 10 + return hparams + + @registry.register_hparams def transformer_h1(): hparams = transformer_base() @@ -483,14 +507,14 @@ def transformer_ff4096(): @registry.register_hparams def transformer_dr0(): hparams = transformer_base() - hparams.residual_dropout = 0.0 + hparams.layer_prepostprocess_dropout = 0.0 return 
hparams @registry.register_hparams def transformer_dr2(): hparams = transformer_base() - hparams.residual_dropout = 0.2 + hparams.layer_prepostprocess_dropout = 0.2 return hparams @@ -528,7 +552,7 @@ def transformer_big_dr1(): hparams.hidden_size = 1024 hparams.filter_size = 4096 hparams.num_heads = 16 - hparams.residual_dropout = 0.1 + hparams.layer_prepostprocess_dropout = 0.1 hparams.batching_mantissa_bits = 2 return hparams @@ -538,14 +562,14 @@ def transformer_big_enfr(): hparams = transformer_big_dr1() hparams.shared_embedding_and_softmax_weights = int(False) hparams.filter_size = 8192 - hparams.residual_dropout = 0.1 + hparams.layer_prepostprocess_dropout = 0.1 return hparams @registry.register_hparams def transformer_big_dr2(): hparams = transformer_big_dr1() - hparams.residual_dropout = 0.2 + hparams.layer_prepostprocess_dropout = 0.2 return hparams diff --git a/tensor2tensor/models/transformer_moe.py b/tensor2tensor/models/transformer_moe.py index 8072f2cf8..6f01667d8 100644 --- a/tensor2tensor/models/transformer_moe.py +++ b/tensor2tensor/models/transformer_moe.py @@ -49,17 +49,22 @@ def model_fn_body_sharded(self, sharded_features): inputs = dp(common_layers.flatten4d3d, inputs) targets = dp(common_layers.flatten4d3d, targets) + def preprocess(x): + return dp(common_layers.layer_preprocess, x, hparams) + + def postprocess(x, y): + return dp(common_layers.layer_postprocess, x, y, hparams) + (encoder_input, encoder_self_attention_bias, encoder_decoder_attention_bias) = dp( transformer.transformer_prepare_encoder, inputs, target_space, hparams) (decoder_input, decoder_self_attention_bias) = dp( transformer.transformer_prepare_decoder, targets, hparams) - residual_fn = transformer.get_residual_fn(hparams) encoder_input = dp(tf.nn.dropout, encoder_input, - 1.0 - hparams.residual_dropout) + 1.0 - hparams.layer_prepostprocess_dropout) decoder_input = dp(tf.nn.dropout, decoder_input, - 1.0 - hparams.residual_dropout) + 1.0 - 
hparams.layer_prepostprocess_dropout) extra_loss = 0 x = encoder_input for layer in xrange(hparams.num_hidden_layers): @@ -67,7 +72,7 @@ def model_fn_body_sharded(self, sharded_features): with tf.variable_scope("encoder_self_attention"): y = dp( common_attention.multihead_attention, - x, + preprocess(x), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, @@ -75,11 +80,11 @@ def model_fn_body_sharded(self, sharded_features): hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers_encoder.split(","): y, loss = common_layers.moe_layer( - dp, self._ps_devices, x, + dp, self._ps_devices, preprocess(x), hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) @@ -87,19 +92,19 @@ def model_fn_body_sharded(self, sharded_features): else: y = dp( common_layers.conv_hidden_relu, - x, + preprocess(x), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) - x = dp(residual_fn, x, y) - encoder_output = x + x = postprocess(x, y) + encoder_output = preprocess(x) x = decoder_input for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("decoder_layer_%d" % layer): with tf.variable_scope("decoder_self_attention"): y = dp( common_attention.multihead_attention, - x, + preprocess(x), None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, @@ -107,11 +112,11 @@ def model_fn_body_sharded(self, sharded_features): hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) with tf.variable_scope("encoder_decoder_attention"): y = dp( common_attention.multihead_attention, - x, + preprocess(x), encoder_output, encoder_decoder_attention_bias, hparams.attention_key_channels or hparams.hidden_size, @@ -119,11 
+124,11 @@ def model_fn_body_sharded(self, sharded_features): hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers_decoder.split(","): y, loss = common_layers.moe_layer( - dp, self._ps_devices, x, + dp, self._ps_devices, preprocess(x), hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) @@ -131,11 +136,12 @@ def model_fn_body_sharded(self, sharded_features): else: y = dp( common_layers.conv_hidden_relu, - x, + preprocess(x), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) + x = preprocess(x) decoder_output = dp(tf.expand_dims, x, 2) return decoder_output, extra_loss @@ -178,7 +184,6 @@ def transformer_moe_base(): # when not in training mode. hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("residual_dropout", 0.1) hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("nbr_decoder_problems", 1) hparams.add_hparam("proximity_bias", int(False)) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 31de7bd5f..74f1e4c8f 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -31,13 +31,31 @@ import tensorflow as tf -def decompress(source, hparams, name): +def residual_conv(x, repeat, hparams, name, reuse=None): + """A stack of convolution blocks with residual connections.""" + with tf.variable_scope(name, reuse=reuse): + k = (3, 1) + dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] + for i in xrange(repeat): + with tf.variable_scope("repeat_%d" % i): + y = common_layers.conv_block( + common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"), + hparams.hidden_size, + dilations_and_kernels, + 
padding="SAME", + name="residual_conv") + y = tf.nn.dropout(y, 1.0 - hparams.dropout) + x += y + return x + + +def decompress_step(source, hparams, first_relu, name): """Decompression function.""" with tf.variable_scope(name): shape = tf.shape(source) thicker = common_layers.conv_block( source, hparams.hidden_size * 2, [((1, 1), (1, 1))], - name="decompress_conv") + first_relu=first_relu, name="decompress_conv") return tf.reshape(thicker, [shape[0], shape[1] * 2, 1, hparams.hidden_size]) @@ -48,28 +66,62 @@ def vae(x, hparams, name): shape = tf.shape(x) epsilon = tf.random_normal([shape[0], shape[1], 1, hparams.z_size]) z = mu + tf.exp(log_sigma / 2) * epsilon - dense = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense") kl = 0.5 * tf.reduce_mean( tf.exp(log_sigma) + tf.square(mu) - 1. - log_sigma, axis=-1) - return dense, tf.reduce_mean(kl) + return z, tf.reduce_mean(kl), mu, log_sigma -def compress_vae(inputs, hparams, name): - """Compress, then VAE.""" +def compress(inputs, hparams, name): + """Compress.""" with tf.variable_scope(name): # Run compression by strided convs. - cur = tf.expand_dims(inputs, axis=2) + cur = inputs for i in xrange(hparams.num_compress_steps): + cur = residual_conv(cur, 1, hparams, "compress_rc_%d" % i) cur = common_layers.conv_block( cur, hparams.hidden_size, [((1, 1), (2, 1))], strides=(2, 1), name="compress_%d" % i) + return cur + +def vae_compress(inputs, hparams, compress_name, decompress_name, reuse=None): + """Compress, then VAE.""" + with tf.variable_scope(compress_name, reuse=reuse): + cur = compress(inputs, hparams, "compress") # Convolve and ReLu to get state. cur = common_layers.conv_block( cur, hparams.hidden_size, [((1, 1), (1, 1))], name="mid_conv") + z, kl_loss, mu, log_sigma = vae(cur, hparams, name="vae") + + with tf.variable_scope(decompress_name, reuse=reuse): + # Decompress. 
+ z = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense") + + for i in xrange(hparams.num_compress_steps): + j = hparams.num_compress_steps - i - 1 + z = residual_conv(z, 1, hparams, "decompress_rc_%d" % j) + z = decompress_step(z, hparams, i > 0, "decompress__step_%d" % j) + return z, kl_loss, mu, log_sigma + + +def encode(x, x_space, hparams, name): + """Transformer preparations and encoder.""" + with tf.variable_scope(name): + (encoder_input, encoder_self_attention_bias, + _) = transformer.transformer_prepare_encoder(x, x_space, hparams) + encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout) + return transformer.transformer_encoder( + encoder_input, encoder_self_attention_bias, hparams) + - cur, kl_loss = vae(cur, hparams, name="vae") - return cur, kl_loss +def dropmask(targets, targets_dropout_max, is_training): + if not is_training: + return targets + targets_drop_prob = tf.random_uniform([]) * targets_dropout_max + drop_mask = tf.random_uniform(tf.shape(targets)[:-1]) + drop_mask = tf.to_float(tf.less(drop_mask, targets_drop_prob)) + keep_mask = tf.expand_dims(1.0 - drop_mask, axis=2) + return targets * keep_mask def vae_transformer_internal(inputs, targets, target_space, hparams): @@ -78,67 +130,46 @@ def vae_transformer_internal(inputs, targets, target_space, hparams): is_training = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN # Prepare inputs, targets, and k. inputs = common_layers.flatten4d3d(inputs) + input_len = tf.shape(inputs)[1] # Double input size to cover targets. + inputs = tf.pad(inputs, [[0, 0], [0, input_len], [0, 0]]) + inputs.set_shape([None, None, hparams.hidden_size]) targets = common_layers.flatten4d3d(targets) k = 2**hparams.num_compress_steps - _, targets = common_layers.pad_to_same_length( + inputs, targets = common_layers.pad_to_same_length( inputs, targets, final_length_divisible_by=k) - - # Transformer preparations and encoder. 
- (encoder_input, encoder_self_attention_bias, - encoder_decoder_attention_bias) = transformer.transformer_prepare_encoder( - inputs, target_space, hparams) - residual_fn = transformer.get_residual_fn(hparams) - encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) - encoder_output = transformer.transformer_encoder( - encoder_input, residual_fn, encoder_self_attention_bias, hparams) - - def get_decoder_autoregressive(): - """Decoder input for autoregressive computation.""" - (a, b) = transformer.transformer_prepare_decoder(targets, hparams) - return (a, b, tf.constant(0.0)) - - # 10% of the time we compress all-zeros, as will be at decoding start. - prob_targets = 0.9 if is_training else 1.0 - to_compress = tf.cond(tf.less(tf.random_uniform([]), prob_targets), - lambda: targets, lambda: tf.zeros_like(targets)) - z, kl_loss = compress_vae(to_compress, hparams, "vae") - # Decompress. - for i in xrange(hparams.num_compress_steps): - j = hparams.num_hidden_layers - i - 1 - z = decompress(z, hparams, "decompress_%d" % j) - - def get_decoder_from_vae(): - """Decoder input computed by VAE.""" - # Return decoder stuff. - (a, b) = transformer.transformer_prepare_decoder( - tf.squeeze(z, axis=2), hparams) - return (a, b, kl_loss) + inputs = encode(inputs, target_space, hparams, "input_enc") + + # Dropout targets or swap for zeros 5% of the time. + max_prestep = hparams.kl_warmup_steps + prob_targets = 0.95 if is_training else 1.0 + targets_dropout_max = common_layers.inverse_lin_decay(max_prestep) - 0.01 + targets = dropmask(targets, targets_dropout_max * 0.7, is_training) + targets = tf.cond(tf.less(tf.random_uniform([]), prob_targets), + lambda: targets, lambda: tf.zeros_like(targets)) + + # Join targets with inputs, run encoder. 
+ # to_encode = common_layers.conv_block( + # tf.expand_dims(tf.concat([targets, inputs], axis=2), axis=2), + # hparams.hidden_size, [((1, 1), (1, 1))], + # first_relu=False, name="join_targets") + # to_compress = encode(tf.squeeze(to_encode, axis=2), + # target_space, hparams, "enc") + + # Compress and vae. + z, kl_loss, _, _ = vae_compress(tf.expand_dims(targets, axis=2), hparams, + "vae_compress", "vae_decompress") + + # Join z with inputs, run decoder. + to_decode = common_layers.conv_block( + tf.concat([z, tf.expand_dims(inputs, axis=2)], axis=3), + hparams.hidden_size, [((1, 1), (1, 1))], name="join_z") + ret = encode(tf.squeeze(to_decode, axis=2), target_space, hparams, "dec") + # to_decode = residual_conv(to_decode, 2, hparams, "dec_conv") + # ret = tf.squeeze(to_decode, axis=2) # Randomize decoder inputs.. - prob_do_vae = common_layers.inverse_exp_decay(40000) * 0.7 - step = tf.to_float(tf.contrib.framework.get_global_step()) - if not is_training: - prob_do_vae = tf.cond(tf.less(step, 40000.0), lambda: tf.constant(0.0), - lambda: tf.constant(1.0)) - (decoder_input, decoder_self_attention_bias, kl_loss2) = tf.cond( - tf.less(tf.random_uniform([]), prob_do_vae), - get_decoder_from_vae, get_decoder_autoregressive) - - # Transformer decoder. 
- decoder_output = transformer.transformer_decoder( - decoder_input, encoder_output, residual_fn, decoder_self_attention_bias, - encoder_decoder_attention_bias, hparams) - decoder_output = tf.expand_dims(decoder_output, 2) - - cond_self = tf.cond(tf.less(step, 30000.0), lambda: tf.constant(1.0), - lambda: tf.constant(0.0)) - prob_self = 0.4 if is_training else cond_self - (ret, kl_loss) = tf.cond(tf.less(tf.random_uniform([]), prob_self), - lambda: (z, kl_loss), - lambda: (decoder_output, kl_loss2)) - - kl_loss *= common_layers.inverse_exp_decay(50000) * 2.0 - return ret, kl_loss + kl_loss *= common_layers.inverse_exp_decay(max_prestep) * 10.0 + return tf.expand_dims(ret, axis=2), kl_loss @registry.register_model @@ -171,6 +202,15 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, features, False, last_position_only=last_position_only) sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4) samples = tf.concat(sharded_samples, 0) + + # 2nd step. + with tf.variable_scope(tf.get_variable_scope(), reuse=True): + features["targets"] = samples + sharded_logits, _ = self.model_fn( + features, False, last_position_only=last_position_only) + sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4) + samples = tf.concat(sharded_samples, 0) + if inputs_old is not None: # Restore to not confuse Estimator. 
features["inputs"] = inputs_old return samples @@ -180,6 +220,22 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, def transformer_vae_small(): """Set of hyperparameters.""" hparams = transformer.transformer_small() + hparams.batch_size = 2048 hparams.add_hparam("z_size", 128) hparams.add_hparam("num_compress_steps", 4) + hparams.add_hparam("kl_warmup_steps", 50000) + return hparams + + +@registry.register_hparams +def transformer_vae_base(): + """Set of hyperparameters.""" + hparams = transformer_vae_small() + hparams.hidden_size = 512 + hparams.filter_size = 2048 + hparams.attention_dropout = 0.1 + hparams.relu_dropout = 0.1 + hparams.dropout = 0.1 + hparams.num_hidden_layers = 4 + hparams.z_size = 256 return hparams diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 05aa9bf26..5c7041014 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -27,6 +27,7 @@ from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.data_generators.problem import preprocess_examples_common from tensor2tensor.utils import registry import tensorflow as tf @@ -233,6 +234,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams): data_items_to_decoders=data_items_to_decoders) if problem is None: + examples = preprocess_examples_common(examples, hparams) examples = preprocessing(examples, data_file_pattern) else: examples = problem.preprocess_examples(examples, mode, hparams) diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 12057d8e6..5e8f4d482 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -45,13 +45,13 @@ def decode_from_dataset(estimator): tf.logging.info("Performing local inference.") infer_problems_data = data_reader.get_data_filepatterns( FLAGS.problems, hparams.data_dir, tf.contrib.learn.ModeKeys.INFER) + 
infer_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.INFER, hparams=hparams, data_file_patterns=infer_problems_data, num_datashards=devices.data_parallelism().n, fixed_problem=i) - result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) def log_fn(inputs, targets, @@ -66,14 +66,21 @@ def log_fn(inputs, "%s_prediction_%d.jpg" % (problem, j)) show_and_save_image(inputs / 255., save_path) elif inputs_vocab: - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) + decoded_inputs = inputs_vocab.decode( + _save_until_eos(inputs.flatten())) tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - decoded_outputs = targets_vocab.decode(_save_until_eos(outputs.flatten())) + if FLAGS.identity_output: + decoded_outputs = " ".join(map(str, outputs.flatten())) + decoded_targets = " ".join(map(str, targets.flatten())) + else: + decoded_outputs = targets_vocab.decode( + _save_until_eos(outputs.flatten())) + decoded_targets = targets_vocab.decode( + _save_until_eos(targets.flatten())) + tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - decoded_targets = targets_vocab.decode(_save_until_eos(targets.flatten())) tf.logging.info("Inference results TARGET: %s" % decoded_targets) - if FLAGS.decode_to_file: output_filepath = FLAGS.decode_to_file + ".outputs." + problem output_file = tf.gfile.Open(output_filepath, "a") @@ -81,21 +88,25 @@ def log_fn(inputs, target_filepath = FLAGS.decode_to_file + ".targets." + problem target_file = tf.gfile.Open(target_filepath, "a") target_file.write(decoded_targets + "\n") - - # The function predict() returns an iterable over the network's - # predictions from the test input. We use it to log inputs and decodes. 
- inputs_iter = result_iter["inputs"] - targets_iter = result_iter["targets"] - outputs_iter = result_iter["outputs"] - for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)): - inputs, targets, outputs = result + result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=True) + count = 0 + for result in result_iter: + # predictions from the test input. We use it to log inputs and decodes. + inputs = result["inputs"] + targets = result["targets"] + outputs = result["outputs"] if FLAGS.decode_return_beams: output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0) for k, beam in enumerate(output_beams): tf.logging.info("BEAM %d:" % k) - log_fn(inputs, targets, beam, problem, j) + log_fn(inputs, targets, beam, problem, count) else: - log_fn(inputs, targets, outputs, problem, j) + log_fn(inputs, targets, outputs, problem, count) + + count += 1 + if FLAGS.decode_num_samples != -1 and count >= FLAGS.decode_num_samples: + break + tf.logging.info("Completed inference on %d samples." 
% count) def decode_from_file(estimator, filename): diff --git a/tensor2tensor/utils/devices.py b/tensor2tensor/utils/devices.py index 4f76367e9..d04b73563 100644 --- a/tensor2tensor/utils/devices.py +++ b/tensor2tensor/utils/devices.py @@ -112,7 +112,7 @@ def _replica_device_setter(worker_device): if FLAGS.schedule == "local_run": assert not FLAGS.sync datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - if FLAGS.locally_shard_to_cpu: + if FLAGS.locally_shard_to_cpu or FLAGS.worker_gpu < 1: datashard_devices += ["cpu:0"] caching_devices = None elif FLAGS.sync: diff --git a/tensor2tensor/utils/input_fn_builder.py b/tensor2tensor/utils/input_fn_builder.py index 79a765ca2..d1b68aa02 100644 --- a/tensor2tensor/utils/input_fn_builder.py +++ b/tensor2tensor/utils/input_fn_builder.py @@ -137,10 +137,6 @@ def input_fn(): tf.get_variable( "problem_%d/total_loss" % n, initializer=100.0, trainable=False)) - tf.get_variable( - "problem_%d/training_loss" % n, initializer=100.0, trainable=False) - tf.get_variable( - "problem_%d/extra_loss" % n, initializer=100.0, trainable=False) if fixed_problem is None: if (hparams.problem_choice == "uniform" or mode != tf.contrib.learn.ModeKeys.TRAIN): diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index db60e07c8..fd82adc30 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -24,6 +24,7 @@ from tensor2tensor.layers import common_layers from tensor2tensor.utils import bleu_hook +from tensor2tensor.utils import rouge import tensorflow as tf @@ -37,9 +38,13 @@ class Metrics(object): NEG_LOG_PERPLEXITY = "neg_log_perplexity" APPROX_BLEU = "approx_bleu_score" RMSE = "rmse" + LOG_POISSON = "log_poisson" + R2 = "r_squared" + ROUGE_2_F = "rouge_2_fscore" + ROUGE_L_F = "rouge_L_fscore" -def padded_rmse(predictions, labels, weights_fn=common_layers.weights_nonzero): +def padded_rmse(predictions, labels, weights_fn=common_layers.weights_all): predictions, labels = 
common_layers.pad_with_zeros(predictions, labels) targets = labels weights = weights_fn(targets) @@ -47,6 +52,33 @@ def padded_rmse(predictions, labels, weights_fn=common_layers.weights_nonzero): return tf.reduce_sum(error * weights), tf.reduce_sum(weights) +def padded_log_poisson(predictions, + labels, + weights_fn=common_layers.weights_all): + # Expects predictions to already be transformed into log space + predictions, labels = common_layers.pad_with_zeros(predictions, labels) + targets = labels + weights = weights_fn(targets) + + lp_loss = tf.nn.log_poisson_loss(targets, predictions, compute_full_loss=True) + return tf.reduce_sum(lp_loss * weights), tf.reduce_sum(weights) + + +def padded_variance_explained(predictions, + labels, + weights_fn=common_layers.weights_all): + # aka R^2 + predictions, labels = common_layers.pad_with_zeros(predictions, labels) + targets = labels + weights = weights_fn(targets) + + y_bar = tf.reduce_mean(weights * targets) + tot_ss = tf.reduce_sum(weights * tf.pow(targets - y_bar, 2)) + res_ss = tf.reduce_sum(weights * tf.pow(targets - predictions, 2)) + r2 = 1. - res_ss / tot_ss + return r2, tf.reduce_sum(weights) + + def padded_accuracy_topk(predictions, labels, k, @@ -112,11 +144,12 @@ def padded_accuracy(predictions, return tf.to_float(tf.equal(outputs, padded_labels)), weights -def create_evaluation_metrics(problems): +def create_evaluation_metrics(problems, model_hparams): """Creates the evaluation metrics for the model. Args: problems: List of tuples (problem name, problem instance). + model_hparams: a set of hparams. 
Returns: A dictionary with keys that are strings naming the evaluation @@ -162,8 +195,14 @@ def problem_metric_fn(predictions, labels, weights): (problem_name, metrics, METRICS_FNS.keys())) class_output = "image" in problem_name and "coco" not in problem_name - weights_fn = (common_layers.weights_all - if class_output else common_layers.weights_nonzero) + real_output = "gene_expression" in problem_name + if model_hparams.prepend_inputs_to_targets: + assert not class_output + weights_fn = common_layers.weights_second_part + elif class_output or real_output: + weights_fn = common_layers.weights_all + else: + weights_fn = common_layers.weights_nonzero for metric in metrics: metric_fn = METRICS_FNS[metric] @@ -188,4 +227,8 @@ def problem_metric_fn(predictions, labels, weights): Metrics.NEG_LOG_PERPLEXITY: padded_neg_log_perplexity, Metrics.APPROX_BLEU: bleu_hook.bleu_score, Metrics.RMSE: padded_rmse, + Metrics.LOG_POISSON: padded_log_poisson, + Metrics.R2: padded_variance_explained, + Metrics.ROUGE_2_F: rouge.rouge_2_fscore, + Metrics.ROUGE_L_F: rouge.rouge_l_fscore, } diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index 48d5dd7a0..da33e1e40 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -166,6 +166,7 @@ def model_fn(features, targets, mode): train = mode == tf.contrib.learn.ModeKeys.TRAIN # Get multi-problem logits and loss based on features["problem_choice"]. + loss_variable_names = [] def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( @@ -191,17 +192,25 @@ def nth_model(n): # On worker 0 also build graph for problems <= 1. # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. 
skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) - sharded_logits, losses_dict = model_class.model_fn( - features, skip=(skipping_is_on and skip_this_one)) - with tf.variable_scope("losses_avg", reuse=True): + if (FLAGS.eval_run_autoregressive and + mode == tf.contrib.learn.ModeKeys.EVAL): + sharded_logits, losses_dict = model_class.eval_autoregressive(features) + else: + sharded_logits, losses_dict = model_class.model_fn( + features, skip=(skipping_is_on and skip_this_one)) + with tf.variable_scope("losses_avg"): total_loss, ops = 0.0, [] for loss_key, loss_value in six.iteritems(losses_dict): - loss_moving_avg = tf.get_variable("problem_%d/%s_loss" % (n, - loss_key)) + loss_name = "problem_%d/%s_loss" % (n, loss_key) + loss_moving_avg = tf.get_variable( + loss_name, initializer=100.0, trainable=False) + loss_variable_names.append(loss_name) ops.append( loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1)) total_loss += loss_value - loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) + with tf.variable_scope(tf.get_variable_scope(), reuse=True): + # Total loss was already constructed on input. + loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) ops.append( loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1)) with tf.variable_scope("train_stats"): # Count steps for this problem. 
@@ -256,13 +265,18 @@ def nth_model(n): tf.summary.scalar("learning_rate", learning_rate) global_step = tf.to_float(tf.contrib.framework.get_global_step()) for n in xrange(len(my_hp.problems)): + names_and_vars = [] with tf.variable_scope("losses_avg", reuse=True): total_loss_var = tf.get_variable("problem_%d/total_loss" % n) - training_loss_var = tf.get_variable("problem_%d/training_loss" % n) - extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) - tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) - tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) - tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) + names_and_vars.append(("total_loss", total_loss_var)) + with tf.variable_scope("losses_avg", reuse=True): + for loss_name in loss_variable_names: + if loss_name.startswith("problem_%d/" % n): + loss_var = tf.get_variable(loss_name) + loss_suffix = loss_name[loss_name.index("/") + 1:] + names_and_vars.append((loss_suffix, loss_var)) + for (loss_name, loss_var) in names_and_vars: + tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var) with tf.variable_scope("train_stats", reuse=True): nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) tf.summary.scalar("problem_%d_frequency" % n, diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 5402e5bde..fea647b2b 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -64,6 +64,7 @@ class Modalities(object): AUDIO = "audio" CLASS_LABEL = "class_label" GENERIC = "generic" + REAL = "real" _MODALITIES = { @@ -72,6 +73,7 @@ class Modalities(object): Modalities.AUDIO: {}, Modalities.CLASS_LABEL: {}, Modalities.GENERIC: {}, + Modalities.REAL: {}, } # Camel case to snake case utils @@ -277,6 +279,11 @@ def class_label_modality(name=None): Modalities.CLASS_LABEL.capitalize()) +def real_modality(name=None): + return _internal_get_modality(name, _MODALITIES[Modalities.REAL], + 
Modalities.REAL.capitalize()) + + def _internal_register_modality(name, mod_collection, collection_str): """Register a modality into mod_collection.""" @@ -309,6 +316,12 @@ def register_generic_modality(name=None): Modalities.GENERIC.capitalize()) +def register_real_modality(name=None): + """Register a real modality. name defaults to class name snake-cased.""" + return _internal_register_modality(name, _MODALITIES[Modalities.REAL], + Modalities.REAL.capitalize()) + + def register_audio_modality(name=None): """Register an audio modality. name defaults to class name snake-cased.""" return _internal_register_modality(name, _MODALITIES[Modalities.AUDIO], @@ -366,6 +379,7 @@ def create_modality(modality_spec, model_hparams): Modalities.IMAGE: image_modality, Modalities.CLASS_LABEL: class_label_modality, Modalities.GENERIC: generic_modality, + Modalities.REAL: real_modality, } modality_full_name, vocab_size = modality_spec diff --git a/tensor2tensor/utils/rouge.py b/tensor2tensor/utils/rouge.py new file mode 100644 index 000000000..29c84729f --- /dev/null +++ b/tensor2tensor/utils/rouge.py @@ -0,0 +1,249 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- coding: utf-8 -*- +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ROUGE metric implementation. + +This is a modified and slightly extended version of +https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + +import tensorflow as tf + + +def _len_lcs(x, y): + """Returns the length of the Longest Common Subsequence between two seqs. + + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: sequence of words + y: sequence of words + + Returns: + integer: Length of LCS between x and y + """ + table = _lcs(x, y) + n, m = len(x), len(y) + return table[n, m] + + +def _lcs(x, y): + """Computes the length of the LCS between two seqs. + + The implementation below uses a dynamic programming algorithm and runs + in O(nm) time where n = len(x) and m = len(y). + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: collection of words + y: collection of words + + Returns: + Table of dictionary of coord and len lcs + """ + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table + + +def _f_lcs(llcs, m, n): + """Computes the LCS-based F-measure score.
+ + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Args: + llcs: Length of LCS + m: number of words in reference summary + n: number of words in candidate summary + + Returns: + Float. LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / (r_lcs + 1e-12) + num = (1 + (beta**2)) * r_lcs * p_lcs + denom = r_lcs + ((beta**2) * p_lcs) + f_lcs = num / (denom + 1e-12) + return f_lcs + + +def rouge_l_sentence_level(eval_sentences, ref_sentences): + """Computes ROUGE-L (sentence level) of two collections of sentences. + + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + Args: + eval_sentences: The sentences that have been picked by the summarizer + ref_sentences: The sentences from the reference set + + Returns: + A float: F_lcs + """ + + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + m = len(ref_sentence) + n = len(eval_sentence) + lcs = _len_lcs(eval_sentence, ref_sentence) + f1_scores.append(_f_lcs(lcs, m, n)) + return np.mean(f1_scores, dtype=np.float32) + + +def rouge_l_fscore(predictions, labels, **unused_kwargs): + """ROUGE scores computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Args: + predictions: tensor, model predictions + labels: tensor, gold output. + + Returns: + rouge_l_fscore: approx rouge-l f1 score. + """ + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + # Convert the outputs and labels to a [batch_size, input_length] tensor.
+ outputs = tf.squeeze(outputs, axis=[-1, -2]) + labels = tf.squeeze(labels, axis=[-1, -2]) + rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (labels, outputs), + tf.float32) + return rouge_l_f_score, tf.constant(1.0) + + +def _get_ngrams(n, text): + """Calculates n-grams. + + Args: + n: which n-grams to calculate + text: An array of tokens + + Returns: + A set of n-grams + """ + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + + +def rouge_n(eval_sentences, ref_sentences, n=2): + """Computes ROUGE-N f1 score of two text collections of sentences. + + Source: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + + Args: + eval_sentences: The sentences that have been picked by the summarizer + ref_sentences: The sentences from the reference set + n: Size of ngram. Defaults to 2. + + Returns: + f1 score for ROUGE-N + """ + + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + eval_ngrams = _get_ngrams(n, eval_sentence) + ref_ngrams = _get_ngrams(n, ref_sentence) + ref_count = len(ref_ngrams) + eval_count = len(eval_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = eval_ngrams.intersection(ref_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case.
This isn't mathematically correct, but it's good enough + if eval_count == 0: + precision = 0.0 + else: + precision = overlapping_count / eval_count + + if ref_count == 0: + recall = 0.0 + else: + recall = overlapping_count / ref_count + + f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8))) + + # return overlapping_count / reference_count + return np.mean(f1_scores, dtype=np.float32) + + +def rouge_2_fscore(predictions, labels, **unused_kwargs): + """ROUGE-2 F1 score computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Args: + predictions: tensor, model predictions + labels: tensor, gold output. + + Returns: + rouge2_fscore: approx rouge-2 f1 score. + """ + + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + # Convert the outputs and labels to a [batch_size, input_length] tensor. + outputs = tf.squeeze(outputs, axis=[-1, -2]) + labels = tf.squeeze(labels, axis=[-1, -2]) + rouge_2_f_score = tf.py_func(rouge_n, (labels, outputs), tf.float32) + return rouge_2_f_score, tf.constant(1.0) diff --git a/tensor2tensor/utils/rouge_test.py b/tensor2tensor/utils/rouge_test.py new file mode 100644 index 000000000..2a8c260e2 --- /dev/null +++ b/tensor2tensor/utils/rouge_test.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +"""Tests for Rouge metric.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +from tensor2tensor.utils import rouge + +import tensorflow as tf + + +class TestRouge2Metric(tf.test.TestCase): + """Tests the rouge-2 metric.""" + + def testRouge2Identical(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + self.assertAllClose(rouge.rouge_n(hypotheses, references), 1.0, atol=1e-03) + + def testRouge2Disjoint(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[8, 9, 10, 11, 12, 13, 14, 15, 16, 17], + [9, 10, 11, 12, 13, 14, 15, 16, 17, 0]]) + self.assertEqual(rouge.rouge_n(hypotheses, references), 0.0) + + def testRouge2PartialOverlap(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[1, 9, 2, 3, 4, 5, 1, 10, 6, 7], + [1, 9, 2, 3, 4, 5, 1, 10, 6, 7]]) + self.assertAllClose(rouge.rouge_n(hypotheses, references), 0.53, atol=1e-03) + + +class TestRougeLMetric(tf.test.TestCase): + """Tests the rouge-l metric.""" + + def testRougeLIdentical(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + self.assertAllClose( + rouge.rouge_l_sentence_level(hypotheses, references), 1.0, atol=1e-03) + + def testRougeLDisjoint(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[8, 9, 10, 11, 12, 13, 14, 15, 16, 17], + [9, 10, 11, 12, 13, 14, 15, 16, 17, 0]]) + self.assertEqual(rouge.rouge_l_sentence_level(hypotheses, references), 0.0) + + def testRougeLPartialOverlap(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], 
+ [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[1, 9, 2, 3, 4, 5, 1, 10, 6, 7], + [1, 9, 2, 3, 4, 5, 1, 10, 6, 7]]) + self.assertAllClose( + rouge.rouge_l_sentence_level(hypotheses, references), 0.837, atol=1e-03) + + +class TestRougeMetricsE2E(tf.test.TestCase): + """Tests the rouge metrics end-to-end.""" + + def testRouge2MetricE2E(self): + vocab_size = 4 + batch_size = 12 + seq_length = 12 + predictions = tf.one_hot( + np.random.randint(vocab_size, size=(batch_size, seq_length, 1, 1)), + depth=4, + dtype=tf.float32) + targets = np.random.randint(4, size=(12, 12, 1, 1)) + with self.test_session() as session: + scores, _ = rouge.rouge_2_fscore(predictions, + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + session.run(a) + + def testRougeLMetricE2E(self): + vocab_size = 4 + batch_size = 12 + seq_length = 12 + predictions = tf.one_hot( + np.random.randint(vocab_size, size=(batch_size, seq_length, 1, 1)), + depth=4, + dtype=tf.float32) + targets = np.random.randint(4, size=(12, 12, 1, 1)) + with self.test_session() as session: + scores, _ = rouge.rouge_l_fscore( + predictions, + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + session.run(a) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 3af4f10c1..8fcf2482d 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -85,11 +85,21 @@ def __init__(self, ps_devices = [""] hparams = copy.copy(hparams) hparams.add_hparam("mode", mode) - # when not in training mode, set all forms of dropout to zero. + # When not in training mode, set all forms of dropout to zero. 
if mode != tf.contrib.learn.ModeKeys.TRAIN: for key in hparams.values(): if key[-len("dropout"):] == "dropout": setattr(hparams, key, 0.0) + # If vocabularies differ, unset shared_embedding_and_softmax_weights. + if hparams.shared_embedding_and_softmax_weights: + same_vocab_sizes = True + for problem in hparams.problems: + if "inputs" in problem.input_modality: + if problem.input_modality["inputs"] != problem.target_modality: + same_vocab_sizes = False + if not same_vocab_sizes: + tf.logging.info("Unsetting shared_embedding_and_softmax_weights.") + hparams.shared_embedding_and_softmax_weights = 0 self._hparams = hparams self._data_parallelism = data_parallelism self._num_datashards = data_parallelism.n @@ -134,6 +144,30 @@ def _create_modalities(self, problem_hparams, hparams): def has_input(self): return self._problem_hparams.input_modality + def eval_autoregressive(self, + features=None, + decode_length=50, + last_position_only=False): + """Autoregressive eval. + + Quadratic time in decode_length. + + Args: + features: a map of string to `Tensor` + decode_length: an integer. How many additional timesteps to decode. + last_position_only: a boolean, speed-up by computing last position only. + + Returns: + sharded_logits: a list of `Tensor`s. Assumes one datashard. + losses: a dictionary: {loss-name (string): floating point `Scalar`}. + Contains a single key "training". + """ + _, logits, losses = self._greedy_infer( + features, + decode_length=decode_length, + last_position_only=last_position_only) + return [logits], losses + def infer(self, features=None, decode_length=50, @@ -169,11 +203,13 @@ def infer(self, beam_size = 1 # No use to run beam-search for a single class.
if beam_size == 1: tf.logging.info("Greedy Decoding") - return self._greedy_infer(features, decode_length, last_position_only) + samples, _, _ = self._greedy_infer(features, decode_length, + last_position_only) else: tf.logging.info("Beam Decoding with beam size %d" % beam_size) - return self._beam_decode(features, decode_length, beam_size, top_beams, - last_position_only, alpha) + samples = self._beam_decode(features, decode_length, beam_size, top_beams, + last_position_only, alpha) + return samples def _beam_decode(self, features, decode_length, beam_size, top_beams, last_position_only, alpha): @@ -258,6 +294,8 @@ def _greedy_infer(self, features, decode_length, last_position_only): Returns: samples: an integer `Tensor`. + logits: `Tensor` of shape [batch_size, time, 1, 1, vocab_size]. + losses: a dictionary: {loss-name (string): floating point `Scalar`} """ if not features: features = {} @@ -268,14 +306,15 @@ def _greedy_infer(self, features, decode_length, last_position_only): if not self.has_input: features["partial_targets"] = tf.to_int64(features["inputs"]) - def infer_step(recent_output, _): + def infer_step(recent_output, recent_logits, unused_loss): """Inference step.""" recent_output.set_shape([None, None, None, 1]) padded = tf.pad(recent_output, [[0, 0], [0, 1], [0, 0], [0, 0]]) features["targets"] = padded # This is inefficient in that it generates samples at all timesteps, # not just the last one, except if last_position_only is set (dangerous). - samples = self.sample(features, last_position_only=last_position_only) + samples, logits, losses = self.sample( + features, last_position_only=last_position_only) # Concatenate the already-generated recent_output with last timestep # of the newly-generated samples. 
if last_position_only: @@ -285,7 +324,11 @@ def infer_step(recent_output, _): cur_sample = tf.to_int64(tf.expand_dims(cur_sample, axis=1)) samples = tf.concat([recent_output, cur_sample], axis=1) samples.set_shape([None, None, None, 1]) - return samples + + # Assuming we have one shard for logits. + logits = tf.concat([recent_logits, logits[0][:, -1:]], 1) + loss = sum(losses.values()) + return samples, logits, loss # Create an initial output tensor. This will be passed # to the infer_step, which adds one timestep at every iteration. @@ -298,20 +341,32 @@ def infer_step(recent_output, _): # input shape, so we confuse it about the input shape. initial_output = tf.slice(initial_output, [0, 0, 0, 0], tf.shape(initial_output)) - if _is_class_modality( - self._hparams.problems[self._problem_idx].target_modality): + target_modality = self._hparams.problems[self._problem_idx].target_modality + if _is_class_modality(target_modality): decode_length = 1 else: decode_length = tf.shape(features["inputs"])[1] + decode_length - result = tf.foldl( - infer_step, - tf.range(decode_length), - initializer=initial_output, + # Initial values of result, logits and loss. + result = initial_output + # tensor of shape [batch_size, time, 1, 1, vocab_size] + logits = tf.zeros((batch_size, 0, 1, 1, target_modality.top_dimensionality)) + logits.set_shape([None, None, None, None, None]) + loss = 0.0 + + result, logits, loss = tf.while_loop( + lambda result, logits, loss: tf.shape(result)[1] < decode_length, + infer_step, [result, logits, loss], + shape_invariants=[ + tf.TensorShape([None, None, None, None]), + tf.TensorShape([None, None, None, None, None]), + tf.TensorShape([]), + ], back_prop=False, parallel_iterations=1) if inputs_old is not None: # Restore to not confuse Estimator. features["inputs"] = inputs_old - return result + losses = {"training": loss} + return result, logits, losses def sample(self, features, last_position_only=False): """Run the model and extract samples. 
@@ -322,8 +377,10 @@ def sample(self, features, last_position_only=False): Returns: samples: an integer `Tensor`. + logits: a list of `Tensor`s, one per datashard. + losses: a dictionary: {loss-name (string): floating point `Scalar`}. """ - sharded_logits, _ = self.model_fn( + sharded_logits, losses = self.model_fn( features, False, last_position_only=last_position_only) if self._hparams.sampling_method == "argmax": sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4) @@ -339,7 +396,7 @@ def _multinomial_squeeze(logits): sharded_samples = self._data_parallelism(_multinomial_squeeze, sharded_logits) - return tf.concat(sharded_samples, 0) + return tf.concat(sharded_samples, 0), sharded_logits, losses def _shard_features(self, features): # pylint: disable=missing-docstring sharded_features = dict() @@ -415,7 +472,7 @@ def model_fn(self, features, skip=False, last_position_only=False): else: body_outputs, losses = self.model_fn_body_sharded( transformed_features) - if isinstance(losses, tf.Tensor): # If it's a single extra loss. + if not isinstance(losses, dict): # If it's a single extra loss. 
losses = {"extra": losses} with tf.variable_scope(target_modality.name, reuse=target_reuse): @@ -469,10 +526,14 @@ def model_fn_body_sharded(self, sharded_features): _with_timing(self.model_fn_body, "model_fn_body"), datashard_to_features) if isinstance(output, tuple): - if isinstance(output[1], dict): - loss = output[1] + losses_sharded = output[1] + if isinstance(losses_sharded[0], dict): + loss = {} + for k in losses_sharded[0].keys(): + k_loss_sharded = [losses[k] for losses in losses_sharded] + loss[k] = tf.reduce_mean(k_loss_sharded) else: - loss = {"extra": tf.reduce_mean(output[1])} + loss = {"extra": tf.reduce_mean(losses_sharded)} output = output[0] else: loss = {"extra": 0.0} diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 9e869c15c..22fd727f9 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -63,6 +63,9 @@ "The number of steps to run training for.") flags.DEFINE_integer("eval_steps", 10, "Number of steps in evaluation.") flags.DEFINE_bool("eval_print", False, "Print eval logits and predictions.") +flags.DEFINE_bool("eval_run_autoregressive", False, + "Run eval autoregressively where we condition on previous" + "generated output instead of the actual target.") flags.DEFINE_integer("keep_checkpoint_max", 20, "How many recent checkpoints to keep.") flags.DEFINE_bool("experimental_optimize_placement", False, @@ -118,6 +121,9 @@ flags.DEFINE_integer("decode_max_input_size", -1, "Maximum number of ids in input. Or <= 0 for no max.") flags.DEFINE_bool("identity_output", False, "To print the output as identity") +flags.DEFINE_integer("decode_num_samples", -1, + "Number of samples to decode. Currently used in" + "decode_from_dataset. 
Use -1 for all.") def make_experiment_fn(data_dir, model_name, train_steps, eval_steps): @@ -144,7 +150,7 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, data_dir=data_dir, model_name=model_name) eval_metrics = metrics.create_evaluation_metrics( - zip(FLAGS.problems.split("-"), hparams.problem_instances)) + zip(FLAGS.problems.split("-"), hparams.problem_instances), hparams) if (hasattr(FLAGS, "autotune") and FLAGS.autotune and FLAGS.objective not in eval_metrics): raise ValueError("Tuning objective %s not among evaluation metrics %s" % @@ -219,10 +225,13 @@ def add_problem_hparams(hparams, problems): for problem_name in problems.split("-"): try: problem = registry.problem(problem_name) - p_hparams = problem.internal_hparams(hparams) except ValueError: problem = None + + if problem is None: p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + else: + p_hparams = problem.internal_hparams(hparams) hparams.problem_instances.append(problem) hparams.problems.append(p_hparams)