diff --git a/README.md b/README.md index c0e34e0fe..edd6460d0 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,6 @@ mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR t2t-datagen \ --data_dir=$DATA_DIR \ --tmp_dir=$TMP_DIR \ - --num_shards=100 \ --problem=$PROBLEM # Train diff --git a/setup.py b/setup.py index 9da5293b9..6be9aba04 100644 --- a/setup.py +++ b/setup.py @@ -5,13 +5,14 @@ setup( name='tensor2tensor', - version='1.1.1', + version='1.1.2', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', url='http://github.com/tensorflow/tensor2tensor', license='Apache 2.0', packages=find_packages(), + package_data={'tensor2tensor.data_generators': ['test_data/*']}, scripts=[ 'tensor2tensor/bin/t2t-trainer', 'tensor2tensor/bin/t2t-datagen', @@ -26,6 +27,8 @@ 'tensorflow': ['tensorflow>=1.2.0rc1'], 'tensorflow_gpu': ['tensorflow-gpu>=1.2.0rc1'], }, + tests_require=['nose'], + test_suite='nose.collector', classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', diff --git a/tensor2tensor/__init__.py b/tensor2tensor/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/__init__.py +++ b/tensor2tensor/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index af5b47f8c..e4acb6731 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -62,10 +63,12 @@ flags.DEFINE_string("problem", "", "The name of the problem to generate data for.") flags.DEFINE_string("exclude_problems", "", "Comma-separates list of problems to exclude.") -flags.DEFINE_integer("num_shards", 10, "How many shards to use.") +flags.DEFINE_integer("num_shards", 0, "How many shards to use. Ignored for " + "registered Problems.") flags.DEFINE_integer("max_cases", 0, "Maximum number of cases to generate (unbounded if 0).") flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") +flags.DEFINE_integer("task_id", -1, "For distributed data generation.") flags.DEFINE_string("t2t_usr_dir", "", "Path to a Python module that will be imported. The " "__init__.py file should include the necessary imports. " @@ -108,6 +111,10 @@ _SUPPORTED_PROBLEM_GENERATORS = { lambda: lm1b.generator(FLAGS.tmp_dir, True), lambda: lm1b.generator(FLAGS.tmp_dir, False) ), + "lm1b_characters": ( + lambda: lm1b.generator(FLAGS.tmp_dir, True, characters=True), + lambda: lm1b.generator(FLAGS.tmp_dir, False, characters=True) + ), "wiki_32k": ( lambda: wiki.generator(FLAGS.tmp_dir, True), 1000 @@ -246,7 +253,7 @@ def generate_data_for_problem(problem): if isinstance(dev_gen, int): # The dev set and test sets are generated as extra shards using the # training generator. The integer specifies the number of training - # shards. FLAGS.num_shards is ignored. + # shards. FLAGS.num_shards is ignored. num_training_shards = dev_gen tf.logging.info("Generating data for %s.", problem) all_output_files = generator_utils.combined_data_filenames( @@ -257,10 +264,11 @@ def generate_data_for_problem(problem): else: # usual case - train data and dev data are generated using separate # generators. 
+ num_shards = FLAGS.num_shards or 10 tf.logging.info("Generating training data for %s.", problem) train_output_files = generator_utils.train_data_filenames( problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, - FLAGS.num_shards) + num_shards) generator_utils.generate_files(training_gen(), train_output_files, FLAGS.max_cases) tf.logging.info("Generating development data for %s.", problem) @@ -275,10 +283,14 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): + tf.logging.info("Generating training data for %s.", problem_name) + if FLAGS.num_shards: + raise ValueError("--num_shards should not be set for registered Problem.") problem = registry.problem(problem_name) + task_id = None if FLAGS.task_id < 0 else FLAGS.task_id problem.generate_data(os.path.expanduser(FLAGS.data_dir), os.path.expanduser(FLAGS.tmp_dir), - FLAGS.num_shards) + task_id=task_id) if __name__ == "__main__": diff --git a/tensor2tensor/bin/t2t-make-tf-configs b/tensor2tensor/bin/t2t-make-tf-configs index 6a4dc8641..0b656aba6 100644 --- a/tensor2tensor/bin/t2t-make-tf-configs +++ b/tensor2tensor/bin/t2t-make-tf-configs @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer index a37767258..13dd7d355 100644 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/bin/t2t-trainer @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/__init__.py b/tensor2tensor/data_generators/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/data_generators/__init__.py +++ b/tensor2tensor/data_generators/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py index 2169e1910..c115a1ebe 100644 --- a/tensor2tensor/data_generators/algorithmic.py +++ b/tensor2tensor/data_generators/algorithmic.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -65,10 +66,7 @@ def dev_size(self): def num_shards(self): return 10 - def generate_data(self, data_dir, _, num_shards=None): - if num_shards is None: - num_shards = self.num_shards - + def generate_data(self, data_dir, _, task_id=-1): def generator_eos(generator): """Shift by NUM_RESERVED_IDS and append EOS token.""" for case in generator: @@ -86,7 +84,7 @@ def generator_eos(generator): utils.generate_dataset_and_shuffle( train_generator_eos(), - self.training_filepaths(data_dir, num_shards, shuffled=True), + self.training_filepaths(data_dir, self.num_shards, shuffled=True), dev_generator_eos(), self.dev_filepaths(data_dir, 1, shuffled=True), shuffle=False) @@ -253,7 +251,7 @@ def zipf_distribution(nbr_symbols, alpha): def zipf_random_sample(distr_map, sample_len): - """Helper function: Generate a random Zipf sample of given lenght. + """Helper function: Generate a random Zipf sample of given length. Args: distr_map: list of float, Zipf's distribution over nbr_symbols. @@ -286,7 +284,7 @@ def reverse_generator_nlplike(nbr_symbols, max_length: integer, maximum length of sequences to generate. nbr_cases: the number of cases to generate. scale_std_dev: float, Normal distribution's standard deviation scale factor - used to draw the lenght of sequence. Default = 1% of the max_length. + used to draw the length of sequence. Default = 1% of the max_length. alpha: float, Zipf's Law Distribution parameter. Default = 1.5. Usually for modelling natural text distribution is in the range [1.1-1.6]. 
diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index e65b47ff0..e061ceb0b 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic_math_test.py b/tensor2tensor/data_generators/algorithmic_math_test.py index 5f0de29fb..7cd67a83c 100644 --- a/tensor2tensor/data_generators/algorithmic_math_test.py +++ b/tensor2tensor/data_generators/algorithmic_math_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py index fb8ff6719..57faaa80b 100644 --- a/tensor2tensor/data_generators/algorithmic_test.py +++ b/tensor2tensor/data_generators/algorithmic_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index 93a8a06a2..6830cf0bf 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,7 +34,7 @@ # pylint: disable=g-import-not-at-top try: # Requires h5py - from tensor2tensor.data_generators import genetics + from tensor2tensor.data_generators import gene_expression except ImportError: pass # pylint: enable=g-import-not-at-top diff --git a/tensor2tensor/data_generators/audio.py b/tensor2tensor/data_generators/audio.py index 4f8c096a5..d0747a88c 100644 --- a/tensor2tensor/data_generators/audio.py +++ b/tensor2tensor/data_generators/audio.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/audio_test.py b/tensor2tensor/data_generators/audio_test.py index 1c19432c3..57e4e1ccc 100644 --- a/tensor2tensor/data_generators/audio_test.py +++ b/tensor2tensor/data_generators/audio_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py index 158bc1b59..9d7678fc4 100644 --- a/tensor2tensor/data_generators/concatenate_examples.py +++ b/tensor2tensor/data_generators/concatenate_examples.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,7 +34,7 @@ + subtokenizer.encode("target French Je t'aime.") + [1]) } -We add a dummy feature "inputs"=[0] for compatability with seq-to-seq models. +We add a dummy feature "inputs"=[0] for compatibility with seq-to-seq models. If FLAGS.combine_to_length is nonzero, then we combine multiple examples into examples of a constant length, possibly with some padding at the end. 
@@ -52,34 +53,33 @@ from tensor2tensor.data_generators import text_encoder import tensorflow as tf -tf.app.flags.DEFINE_string("vocab_file", "", - "SubwordTextEncoder vocabulary file") +tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file") -tf.app.flags.DEFINE_boolean( +tf.flags.DEFINE_boolean( "random_reverse", False, "If true, write half of the example with source/target reversed") -tf.app.flags.DEFINE_boolean( +tf.flags.DEFINE_boolean( "count_everything", False, "If true, assign positive weights to designators, source and target. " "If false, assign positive weights only to target.") -tf.app.flags.DEFINE_string("source_domain_string", "English", "") -tf.app.flags.DEFINE_string("target_domain_string", "French", "") +tf.flags.DEFINE_string("source_domain_string", "English", "") +tf.flags.DEFINE_string("target_domain_string", "French", "") -tf.app.flags.DEFINE_integer( +tf.flags.DEFINE_integer( "combine_to_length", 0, "If positive, concatenate examples to form examples with target length " " equal to this value. Targets are padded with subtoken id=0.") -tf.app.flags.DEFINE_string("in_file", "", "input filename") +tf.flags.DEFINE_string("in_file", "", "input filename") -tf.app.flags.DEFINE_string( +tf.flags.DEFINE_string( "out_prefix", "/usr/local/google/tmp/concat", "The output filename is equal to out_prefix plus " "the last 15 characters of in_file. (e.g. -00001-of-00100)") -FLAGS = tf.app.flags.FLAGS +FLAGS = tf.flags.FLAGS def _make_example(ids, weights, raw_num_bytes): diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py new file mode 100644 index 000000000..31d1cd150 --- /dev/null +++ b/tensor2tensor/data_generators/gene_expression.py @@ -0,0 +1,357 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Gene expression problems. + +Inputs are bases ACTG (with indices assigned in that order). + +Requires the h5py library. + +File format expected: + * h5 file + * h5 datasets should include {train, valid, test}_{in, na, out}, which will + map to inputs, targets mask, and targets for the train, dev, and test + datasets. + * Each record in *_in is a bool 2-D numpy array with one-hot encoded base + pairs with shape [num_input_timesteps, 4]. The base order is ACTG. + * Each record in *_na is a bool 1-D numpy array with shape + [num_output_timesteps]. + * Each record in *_out is a float 2-D numpy array with shape + [num_output_timesteps, num_predictions]. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools +import math +import multiprocessing as mp +import os + +# Dependency imports + +import h5py +import numpy as np + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import metrics +from tensor2tensor.utils import registry + +import tensorflow as tf + +MAX_CONCURRENT_PROCESSES = 10 +_bases = list("ACTG") + + +class GeneExpressionProblem(problem.Problem): + """Base Problem for gene expression datasets.""" + + @property + def download_url(self): + raise NotImplementedError() + + @property + def h5_file(self): + raise NotImplementedError() + + @property + def num_output_predictions(self): + """Number of float predictions per timestep.""" + return 10 + + @property + def chunk_size(self): + return 4 + + def feature_encoders(self, data_dir): + del data_dir + return { + "inputs": DNAEncoder(chunk_size=self.chunk_size), + # TODO(rsepassi): RealEncoder? 
+ "targets": text_encoder.TextEncoder() + } + + @property + def num_shards(self): + return 100 + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + try: + # Download source data if download_url specified + h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, + self.download_url) + except NotImplementedError: + # Otherwise, look for it locally + h5_filepath = os.path.join(tmp_dir, self.h5_file) + + with h5py.File(h5_filepath, "r") as h5_file: + num_train_examples = h5_file["train_in"].len() + num_dev_examples = h5_file["valid_in"].len() + num_test_examples = h5_file["test_in"].len() + + # Collect all_filepaths to later shuffle + all_filepaths = [] + # Collect created shard processes to start and join + processes = [] + + datasets = [(self.training_filepaths, self.num_shards, "train", + num_train_examples), (self.dev_filepaths, 1, "valid", + num_dev_examples), + (self.test_filepaths, 1, "test", num_test_examples)] + for fname_fn, nshards, key_prefix, num_examples in datasets: + outfiles = fname_fn(data_dir, nshards, shuffled=False) + all_filepaths.extend(outfiles) + for start_idx, end_idx, outfile in generate_shard_args( + outfiles, num_examples): + p = mp.Process( + target=generate_dataset, + args=(h5_filepath, key_prefix, [outfile], self.chunk_size, + start_idx, end_idx)) + processes.append(p) + + # 1 per training shard + dev + test + assert len(processes) == self.num_shards + 2 + + # Start and wait for processes in batches + num_batches = int( + math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES)) + for i in xrange(num_batches): + start = i * MAX_CONCURRENT_PROCESSES + end = start + MAX_CONCURRENT_PROCESSES + current = processes[start:end] + for p in current: + p.start() + for p in current: + p.join() + + # Shuffle + generator_utils.shuffle_dataset(all_filepaths) + + def hparams(self, defaults, model_hparams): + p = defaults + vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": 
(registry.Modalities.SYMBOL, vocab_size)} + p.target_modality = ("%s:real" % registry.Modalities.GENERIC, + self.num_output_predictions) + p.input_space_id = problem.SpaceID.DNA + p.target_space_id = problem.SpaceID.REAL + + def example_reading_spec(self): + # TODO(rsepassi): propagate and apply targets_mask to output RealModality + # and to eval metrics (weights_fn?). + data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets_mask": tf.VarLenFeature(tf.float32), + "targets": tf.VarLenFeature(tf.float32), + } + data_items_to_decoders = None + return (data_fields, data_items_to_decoders) + + def preprocess_examples(self, examples, mode): + del mode + + # Reshape targets + examples["targets"] = tf.reshape(examples["targets"], + [-1, 1, self.num_output_predictions]) + examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1, 1]) + + # Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them. + # Add epsilon because some unmasked labels are actually 0. 
+ examples["targets"] += 1e-6 + examples["targets"] *= examples["targets_mask"] + + return examples + + def eval_metrics(self): + return [metrics.Metrics.RMSE] + + +@registry.register_problem("gene_expression_cage10") +class GeneExpressionCAGE10(GeneExpressionProblem): + + @property + def download_url(self): + return "https://storage.googleapis.com/262k_binned/cage10_l262k_w128.h5" + + @property + def h5_file(self): + return "cage10.h5" + + +@registry.register_problem("gene_expression_gm12878") +class GeneExpressionGM12878(GeneExpressionProblem): + + @property + def download_url(self): + return "https://storage.googleapis.com/262k_binned/gm12878_l262k_w128.h5" + + @property + def h5_file(self): + return "gm12878.h5" + + +@registry.register_problem("gene_expression_l262k") +class GeneExpressionL262k(GeneExpressionProblem): + + @property + def h5_file(self): + return "l262k_w128.h5" + + +def generate_shard_args(outfiles, num_examples): + """Generate start and end indices per outfile.""" + num_shards = len(outfiles) + num_examples_per_shard = num_examples // num_shards + start_idxs = [i * num_examples_per_shard for i in xrange(num_shards)] + end_idxs = list(start_idxs) + end_idxs.pop(0) + end_idxs.append(num_examples) + return zip(start_idxs, end_idxs, outfiles) + + +def generate_dataset(h5_filepath, + key_prefix, + out_filepaths, + chunk_size=1, + start_idx=None, + end_idx=None): + print("PID: %d, Key: %s, (Start, End): (%s, %s)" % (os.getpid(), key_prefix, + start_idx, end_idx)) + generator_utils.generate_files( + dataset_generator(h5_filepath, key_prefix, chunk_size, start_idx, + end_idx), out_filepaths) + + +def dataset_generator(filepath, + dataset, + chunk_size=1, + start_idx=None, + end_idx=None): + encoder = DNAEncoder(chunk_size=chunk_size) + with h5py.File(filepath, "r") as h5_file: + # Get input keys from h5_file + src_keys = [s % dataset for s in ["%s_in", "%s_na", "%s_out"]] + src_values = [h5_file[k] for k in src_keys] + inp_data, mask_data, out_data = 
src_values + assert len(set([v.len() for v in src_values])) == 1 + + if start_idx is None: + start_idx = 0 + if end_idx is None: + end_idx = inp_data.len() + + for i in xrange(start_idx, end_idx): + if i % 100 == 0: + print("Generating example %d for %s" % (i, dataset)) + inputs, mask, outputs = inp_data[i], mask_data[i], out_data[i] + yield to_example_dict(encoder, inputs, mask, outputs) + + +def to_example_dict(encoder, inputs, mask, outputs): + """Convert single h5 record to an example dict.""" + # Inputs + bases = [] + input_ids = [] + last_idx = -1 + for row in np.argwhere(inputs): + idx, base_id = row + idx, base_id = int(idx), int(base_id) + assert idx > last_idx # if not, means 2 True values in 1 row + # Some rows are all False. Those rows are mapped to UNK_ID. + while idx != last_idx + 1: + bases.append(encoder.UNK) + last_idx += 1 + bases.append(_bases[base_id]) + last_idx = idx + assert len(inputs) == len(bases) + + input_ids = encoder.encode(bases) + input_ids.append(text_encoder.EOS_ID) + + # Targets: mask and output + targets_mask = [float(v) for v in mask] + # The output is (n, m); store targets_shape so that it can be reshaped + # properly on the other end. + targets = [float(v) for v in outputs.flatten()] + targets_shape = [int(dim) for dim in outputs.shape] + assert mask.shape[0] == outputs.shape[0] + + example_keys = ["inputs", "targets_mask", "targets", "targets_shape"] + ex_dict = dict( + zip(example_keys, [input_ids, targets_mask, targets, targets_shape])) + return ex_dict + + +class DNAEncoder(text_encoder.TextEncoder): + """ACTG strings to ints and back. Optionally chunks bases into single ids. + + Uses 'X' as an unknown base. 
+ """ + UNK = "X" + PAD = "0" + + def __init__(self, + chunk_size=1, + num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS): + super(DNAEncoder, self).__init__(num_reserved_ids=num_reserved_ids) + # Build a vocabulary of chunks of size chunk_size + self._chunk_size = chunk_size + chunks = [] + for size in range(1, chunk_size + 1): + c = itertools.product(_bases + [DNAEncoder.UNK], repeat=size) + num_pad = chunk_size - size + padding = (DNAEncoder.PAD,) * num_pad + c = [el + padding for el in c] + chunks.extend(c) + chunks.sort() + ids = range(self._num_reserved_ids, len(chunks) + self._num_reserved_ids) + self._ids_to_chunk = dict(zip(ids, chunks)) + self._chunks_to_ids = dict(zip(chunks, ids)) + + @property + def vocab_size(self): + return len(self._ids_to_chunk) + self._num_reserved_ids + + def encode(self, s): + bases = list(s) + pad = [DNAEncoder.PAD] * (len(bases) % self._chunk_size) + bases.extend(pad) + assert (len(bases) % self._chunk_size) == 0 + num_chunks = len(bases) // self._chunk_size + ids = [] + for chunk_idx in xrange(num_chunks): + start_idx = chunk_idx * self._chunk_size + end_idx = start_idx + self._chunk_size + chunk = tuple(bases[start_idx:end_idx]) + if chunk not in self._chunks_to_ids: + raise ValueError("Unrecognized chunk %s" % chunk) + ids.append(self._chunks_to_ids[chunk]) + return ids + + def decode(self, ids): + bases = [] + for idx in ids: + if idx >= self._num_reserved_ids: + chunk = self._ids_to_chunk[idx] + if DNAEncoder.PAD in chunk: + chunk = chunk[:chunk.index(DNAEncoder.PAD)] + else: + chunk = [text_encoder.RESERVED_TOKENS[idx]] + bases.extend(chunk) + return "".join(bases) diff --git a/tensor2tensor/data_generators/genetics_test.py b/tensor2tensor/data_generators/gene_expression_test.py similarity index 70% rename from tensor2tensor/data_generators/genetics_test.py rename to tensor2tensor/data_generators/gene_expression_test.py index 70b4fe495..2d7bbe832 100644 --- a/tensor2tensor/data_generators/genetics_test.py +++ 
b/tensor2tensor/data_generators/gene_expression_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,7 +22,7 @@ import numpy as np -from tensor2tensor.data_generators import genetics +from tensor2tensor.data_generators import gene_expression import tensorflow as tf @@ -29,21 +30,28 @@ class GeneticsTest(tf.test.TestCase): def _oneHotBases(self, bases): + ref = ["A", "C", "T", "G"] one_hots = [] - for base_id in bases: + for base in bases: one_hot = [False] * 4 - if base_id < 4: - one_hot[base_id] = True + if base in ref: + one_hot[ref.index(base)] = True one_hots.append(one_hot) return np.array(one_hots) def testRecordToExample(self): - inputs = self._oneHotBases([0, 1, 3, 4, 1, 0]) + encoder = gene_expression.DNAEncoder(chunk_size=2) + raw_inputs = ["A", "C", "G", "X", "C", "T"] + + # Put in numpy arrays in the same format as in the h5 file + inputs = self._oneHotBases(raw_inputs) mask = np.array([True, False, True]) outputs = np.array([[1.0, 2.0, 3.0], [5.0, 1.0, 0.2], [5.1, 2.3, 2.3]]) - ex_dict = genetics.to_example_dict(inputs, mask, outputs) + # Convert to example dict + ex_dict = gene_expression.to_example_dict(encoder, inputs, mask, outputs) - self.assertAllEqual([2, 3, 5, 6, 3, 2, 1], ex_dict["inputs"]) + self.assertEqual(len(raw_inputs) // 2 + 1, len(ex_dict["inputs"])) + self.assertAllEqual(encoder.encode(raw_inputs) + [1], ex_dict["inputs"]) self.assertAllEqual([1.0, 0.0, 1.0], ex_dict["targets_mask"]) self.assertAllEqual([1.0, 2.0, 3.0, 5.0, 1.0, 0.2, 5.1, 2.3, 2.3], ex_dict["targets"]) @@ -53,7 +61,7 @@ def testGenerateShardArgs(self): num_examples = 37 num_shards = 4 outfiles = [str(i) for i in range(num_shards)] - shard_args = genetics.generate_shard_args(outfiles, num_examples) + shard_args = gene_expression.generate_shard_args(outfiles, num_examples) starts, ends, fnames = zip(*shard_args) self.assertAllEqual([0, 9, 18, 27], starts) diff --git 
a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 5c0c94bce..866a0f3e7 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -247,53 +248,19 @@ def gunzip_file(gz_path, new_path): ] -def get_or_generate_vocab(data_dir, tmp_dir, - vocab_filename, vocab_size, sources=None): - """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS).""" +def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size, + generator_fn): + """Inner implementation for vocab generators.""" vocab_filepath = os.path.join(data_dir, vocab_filename) if tf.gfile.Exists(vocab_filepath): tf.logging.info("Found vocab file: %s", vocab_filepath) vocab = text_encoder.SubwordTextEncoder(vocab_filepath) return vocab - sources = sources or _DATA_FILE_URLS - tf.logging.info("Generating vocab from: %s", str(sources)) token_counts = defaultdict(int) - for source in sources: - url = source[0] - filename = os.path.basename(url) - read_type = "r:gz" if "tgz" in filename else "r" - - compressed_file = maybe_download(tmp_dir, filename, url) - - with tarfile.open(compressed_file, read_type) as corpus_tar: - corpus_tar.extractall(tmp_dir) - - for lang_file in source[1]: - tf.logging.info("Reading file: %s" % lang_file) - filepath = os.path.join(tmp_dir, lang_file) - - # For some datasets a second extraction is necessary. - if ".gz" in lang_file: - new_filepath = os.path.join(tmp_dir, lang_file[:-3]) - if tf.gfile.Exists(new_filepath): - tf.logging.info( - "Subdirectory %s already exists, skipping unpacking" % filepath) - else: - tf.logging.info("Unpacking subdirectory %s" % filepath) - gunzip_file(filepath, new_filepath) - filepath = new_filepath - - # Use Tokenizer to count the word occurrences. 
- with tf.gfile.GFile(filepath, mode="r") as source_file: - file_byte_budget = 3.5e5 if "en" in filepath else 7e5 - for line in source_file: - if file_byte_budget <= 0: - break - line = line.strip() - file_byte_budget -= len(line) - for tok in tokenizer.encode(text_encoder.native_to_unicode(line)): - token_counts[tok] += 1 + for item in generator_fn(): + for tok in tokenizer.encode(text_encoder.native_to_unicode(item)): + token_counts[tok] += 1 vocab = text_encoder.SubwordTextEncoder.build_to_target_size( vocab_size, token_counts, 1, 1e3) @@ -301,6 +268,55 @@ def get_or_generate_vocab(data_dir, tmp_dir, return vocab +def get_or_generate_vocab(data_dir, + tmp_dir, + vocab_filename, + vocab_size, + sources=None): + """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS).""" + sources = sources or _DATA_FILE_URLS + + def generate(): + tf.logging.info("Generating vocab from: %s", str(sources)) + for source in sources: + url = source[0] + filename = os.path.basename(url) + read_type = "r:gz" if "tgz" in filename else "r" + + compressed_file = maybe_download(tmp_dir, filename, url) + + with tarfile.open(compressed_file, read_type) as corpus_tar: + corpus_tar.extractall(tmp_dir) + + for lang_file in source[1]: + tf.logging.info("Reading file: %s" % lang_file) + filepath = os.path.join(tmp_dir, lang_file) + + # For some datasets a second extraction is necessary. + if ".gz" in lang_file: + new_filepath = os.path.join(tmp_dir, lang_file[:-3]) + if tf.gfile.Exists(new_filepath): + tf.logging.info( + "Subdirectory %s already exists, skipping unpacking" % filepath) + else: + tf.logging.info("Unpacking subdirectory %s" % filepath) + gunzip_file(filepath, new_filepath) + filepath = new_filepath + + # Use Tokenizer to count the word occurrences. 
+ with tf.gfile.GFile(filepath, mode="r") as source_file: + file_byte_budget = 3.5e5 if "en" in filepath else 7e5 + for line in source_file: + if file_byte_budget <= 0: + break + line = line.strip() + file_byte_budget -= len(line) + yield line + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) + + def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, index, vocab_filename, vocab_size): r"""Generate a vocabulary from a tabbed source file. @@ -320,27 +336,37 @@ def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, Returns: The vocabulary. """ - vocab_filepath = os.path.join(data_dir, vocab_filename) - if os.path.exists(vocab_filepath): - vocab = text_encoder.SubwordTextEncoder(vocab_filepath) - return vocab - - # Use Tokenizer to count the word occurrences. - token_counts = defaultdict(int) - filepath = os.path.join(tmp_dir, source_filename) - with tf.gfile.GFile(filepath, mode="r") as source_file: - for line in source_file: - line = line.strip() - if line and "\t" in line: - parts = line.split("\t", maxsplit=1) - part = parts[index].strip() - for tok in tokenizer.encode(text_encoder.native_to_unicode(part)): - token_counts[tok] += 1 - - vocab = text_encoder.SubwordTextEncoder.build_to_target_size( - vocab_size, token_counts, 1, 1e3) - vocab.store_to_file(vocab_filepath) - return vocab + def generate(): + filepath = os.path.join(tmp_dir, source_filename) + tf.logging.info("Generating vocab from %s", filepath) + with tf.gfile.GFile(filepath, mode="r") as source_file: + for line in source_file: + line = line.strip() + if line and "\t" in line: + parts = line.split("\t", maxsplit=1) + part = parts[index].strip() + yield part + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) + + +def get_or_generate_txt_vocab(data_dir, vocab_filename, vocab_size, + filepatterns): + """Generate a vocabulary from txt files with example-per-line.""" + if 
isinstance(filepatterns, str): + filepatterns = [filepatterns] + + def generate(): + tf.logging.info("Generating vocab from %s", filepatterns) + for filepattern in filepatterns: + for filename in tf.gfile.Glob(filepattern): + with tf.gfile.GFile(filename, mode="r") as source_file: + for line in source_file: + yield line.strip() + + return get_or_generate_vocab_inner( + data_dir, vocab_filename, vocab_size, generator_fn=generate) def read_records(filename): diff --git a/tensor2tensor/data_generators/generator_utils_test.py b/tensor2tensor/data_generators/generator_utils_test.py index c776d120c..fd6e15ca3 100644 --- a/tensor2tensor/data_generators/generator_utils_test.py +++ b/tensor2tensor/data_generators/generator_utils_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -84,6 +85,27 @@ def testGunzipFile(self): os.remove(tmp_file_path + ".txt") os.remove(tmp_file_path) + def testGetOrGenerateTxtVocab(self): + data_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + test_file = os.path.join(self.get_temp_dir(), "test.txt") + with tf.gfile.Open(test_file, "w") as outfile: + outfile.write("a b c\n") + outfile.write("d e f\n") + # Create a vocab over the test file. + vocab1 = generator_utils.get_or_generate_txt_vocab( + data_dir, "test.voc", 20, test_file) + self.assertTrue(tf.gfile.Exists(os.path.join(data_dir, "test.voc"))) + self.assertIsNotNone(vocab1) + + # Append a new line to the test file which would change the vocab if + # the vocab were not being read from file. 
+ with tf.gfile.Open(test_file, "a") as outfile: + outfile.write("g h i\n") + vocab2 = generator_utils.get_or_generate_txt_vocab( + data_dir, "test.voc", 20, test_file) + self.assertTrue(tf.gfile.Exists(os.path.join(data_dir, "test.voc"))) + self.assertIsNotNone(vocab2) + self.assertEqual(vocab1.dump(), vocab2.dump()) if __name__ == "__main__": tf.test.main() diff --git a/tensor2tensor/data_generators/genetics.py b/tensor2tensor/data_generators/genetics.py deleted file mode 100644 index 255e0caf9..000000000 --- a/tensor2tensor/data_generators/genetics.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Genetics problems. - -Inputs are bases ACTG (with indices assigned in that order). - -Requires the h5py library. - -File format expected: - * h5 file - * h5 datasets should include {train, valid, test}_{in, na, out}, which will - map to inputs, targets mask, and targets for the train, dev, and test - datasets. - * Each record in *_in is a bool 2-D numpy array with one-hot encoded base - pairs with shape [num_input_timesteps, 4]. The base order is ACTG. - * Each record in *_na is a bool 1-D numpy array with shape - [num_output_timesteps]. - * Each record in *_out is a float 2-D numpy array with shape - [num_output_timesteps, num_predictions]. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import multiprocessing as mp -import os - -# Dependency imports - -import h5py -import numpy as np - -from six.moves import xrange # pylint: disable=redefined-builtin - -from tensor2tensor.data_generators import generator_utils -from tensor2tensor.data_generators import problem -from tensor2tensor.data_generators import text_encoder -from tensor2tensor.utils import registry - -_bases = list("ACTG") -BASE_TO_ID = dict(zip(_bases, range(len(_bases)))) -ID_TO_BASE = dict(zip(range(len(_bases)), _bases)) -UNK_ID = len(_bases) - - -# TODO(rsepassi): -# * DataEncoder for genetic bases -# * GeneticModality and problem hparams -# * Training preprocessing - - -class GeneticsProblem(problem.Problem): - - @property - def download_url(self): - raise NotImplementedError() - - @property - def h5_file(self): - raise NotImplementedError() - - def generate_data(self, data_dir, tmp_dir, num_shards=None): - if num_shards is None: - num_shards = 100 - - # Download source data - h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, - self.download_url) - with h5py.File(h5_filepath, "r") as h5_file: - num_train_examples = h5_file["train_in"].len() - num_dev_examples = h5_file["valid_in"].len() - num_test_examples = h5_file["test_in"].len() - - # Collect all_filepaths to later shuffle - all_filepaths = [] - # Collect created shard processes to start and join - processes = [] - - datasets = [(self.training_filepaths, num_shards, "train", - num_train_examples), (self.dev_filepaths, 1, "valid", - num_dev_examples), - (self.test_filepaths, 1, "test", num_test_examples)] - for fname_fn, nshards, key_prefix, num_examples in datasets: - outfiles = fname_fn(data_dir, nshards, shuffled=False) - all_filepaths.extend(outfiles) - for start_idx, end_idx, outfile in generate_shard_args( - outfiles, num_examples): - p = mp.Process( - target=generate_dataset, - 
args=(h5_filepath, key_prefix, [outfile], start_idx, end_idx)) - processes.append(p) - - # Start and wait for processes - assert len(processes) == num_shards + 2 # 1 per training shard + dev + test - for p in processes: - p.start() - for p in processes: - p.join() - - # Shuffle - generator_utils.shuffle_dataset(all_filepaths) - - -@registry.register_problem("genetics_cage10") -class GeneticsCAGE10(GeneticsProblem): - - @property - def download_url(self): - return "https://storage.googleapis.com/262k_binned/cage10_l262k_w128.h5" - - @property - def h5_file(self): - return "cage10.h5" - - -@registry.register_problem("genetics_gm12878") -class GeneticsGM12878(GeneticsProblem): - - @property - def download_url(self): - return "https://storage.googleapis.com/262k_binned/gm12878_l262k_w128.h5" - - @property - def h5_file(self): - return "gm12878.h5" - - -def generate_shard_args(outfiles, num_examples): - """Generate start and end indices per outfile.""" - num_shards = len(outfiles) - num_examples_per_shard = num_examples // num_shards - start_idxs = [i * num_examples_per_shard for i in xrange(num_shards)] - end_idxs = list(start_idxs) - end_idxs.pop(0) - end_idxs.append(num_examples) - return zip(start_idxs, end_idxs, outfiles) - - -def generate_dataset(h5_filepath, - key_prefix, - out_filepaths, - start_idx=None, - end_idx=None): - print("PID: %d, Key: %s, (Start, End): (%s, %s)" % (os.getpid(), key_prefix, - start_idx, end_idx)) - generator_utils.generate_files( - dataset_generator(h5_filepath, key_prefix, start_idx, end_idx), - out_filepaths) - - -def dataset_generator(filepath, dataset, start_idx=None, end_idx=None): - with h5py.File(filepath, "r") as h5_file: - # Get input keys from h5_file - src_keys = [s % dataset for s in ["%s_in", "%s_na", "%s_out"]] - src_values = [h5_file[k] for k in src_keys] - inp_data, mask_data, out_data = src_values - assert len(set([v.len() for v in src_values])) == 1 - - if start_idx is None: - start_idx = 0 - if end_idx is None: - 
end_idx = inp_data.len() - - for i in xrange(start_idx, end_idx): - if i % 100 == 0: - print("Generating example %d for %s" % (i, dataset)) - inputs, mask, outputs = inp_data[i], mask_data[i], out_data[i] - yield to_example_dict(inputs, mask, outputs) - - -def to_example_dict(inputs, mask, outputs): - """Convert single h5 record to an example dict.""" - # Inputs - input_ids = [] - last_idx = -1 - for row in np.argwhere(inputs): - idx, base_id = row - idx, base_id = int(idx), int(base_id) - assert idx > last_idx # if not, means 2 True values in 1 row - # Some rows are all False. Those rows are mapped to UNK_ID. - while idx != last_idx + 1: - input_ids.append(UNK_ID + text_encoder.NUM_RESERVED_TOKENS) - last_idx += 1 - input_ids.append(base_id + text_encoder.NUM_RESERVED_TOKENS) - last_idx = idx - assert len(inputs) == len(input_ids) - input_ids.append(text_encoder.EOS_ID) - - # Targets: mask and output - targets_mask = [float(v) for v in mask] - # The output is (n, m); store targets_shape so that it can be reshaped - # properly on the other end. - targets = [float(v) for v in outputs.flatten()] - targets_shape = [int(dim) for dim in outputs.shape] - assert mask.shape[0] == outputs.shape[0] - - example_keys = ["inputs", "targets_mask", "targets", "targets_shape"] - ex_dict = dict( - zip(example_keys, [input_ids, targets_mask, targets, targets_shape])) - return ex_dict diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index e3567d78f..fdad8d432 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -306,14 +307,38 @@ def mscoco_generator(data_dir, "image/width": [width] } + +class ImageProblem(problem.Problem): + + def example_reading_spec(self, label_key=None): + if label_key is None: + label_key = "image/class/label" + + data_fields = { + "image/encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), + label_key: tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = { + "inputs": + tf.contrib.slim.tfexample_decoder.Image( + image_key="image/encoded", + format_key="image/format", + channels=3), + "targets": + tf.contrib.slim.tfexample_decoder.Tensor(label_key), + } + + return data_fields, data_items_to_decoders + # French street names dataset. @registry.register_problem -class ImageFSNS(problem.Problem): +class ImageFSNS(ImageProblem): """Problem spec for French Street Name recognition.""" - def generate_data(self, data_dir, tmp_dir): + def generate_data(self, data_dir, tmp_dir, task_id=-1): list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" "street/python/fsns_urls.txt") fsns_urls = generator_utils.maybe_download( @@ -350,6 +375,10 @@ def hparams(self, defaults, model_hparams): p.input_space_id = problem.SpaceID.DIGIT_0 p.target_space_id = problem.SpaceID.DIGIT_1 + def example_reading_spec(self): + label_key = "image/unpadded_label" + return super(ImageFSNS, self).example_reading_spec(self, + label_key=label_key) # Filename for CELEBA data. _CELEBA_NAME = "img_align_celeba" diff --git a/tensor2tensor/data_generators/image_test.py b/tensor2tensor/data_generators/image_test.py index 6c9984265..59cad4226 100644 --- a/tensor2tensor/data_generators/image_test.py +++ b/tensor2tensor/data_generators/image_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/inspect.py b/tensor2tensor/data_generators/inspect.py index dad0c1c83..848b74a2d 100644 --- a/tensor2tensor/data_generators/inspect.py +++ b/tensor2tensor/data_generators/inspect.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,15 +32,16 @@ import tensorflow as tf -tf.app.flags.DEFINE_string("subword_text_encoder_filename", "", - "SubwordTextEncoder vocabulary file") -tf.app.flags.DEFINE_string("input_filename", "", "input filename") -tf.app.flags.DEFINE_bool("print_inputs", False, - "Print decoded inputs to stdout") -tf.app.flags.DEFINE_bool("print_targets", False, - "Print decoded targets to stdout") +tf.flags.DEFINE_string("subword_text_encoder_filename", "", + "SubwordTextEncoder vocabulary file") +tf.flags.DEFINE_string("token_text_encoder_filename", "", + "TokenTextEncoder vocabulary file") +tf.flags.DEFINE_bool("byte_text_encoder", False, "use a ByteTextEncoder") +tf.flags.DEFINE_string("input_filename", "", "input filename") +tf.flags.DEFINE_bool("print_inputs", False, "Print decoded inputs to stdout") +tf.flags.DEFINE_bool("print_targets", False, "Print decoded targets to stdout") -FLAGS = tf.app.flags.FLAGS +FLAGS = tf.flags.FLAGS def main(_): @@ -47,6 +49,10 @@ def main(_): if FLAGS.subword_text_encoder_filename: encoder = text_encoder.SubwordTextEncoder( FLAGS.subword_text_encoder_filename) + elif FLAGS.token_text_encoder_filename: + encoder = text_encoder.TokenTextEncoder(FLAGS.token_text_encoder_filename) + elif FLAGS.byte_text_encoder: + encoder = text_encoder.ByteTextEncoder() else: encoder = None reader = tf.python_io.tf_record_iterator(FLAGS.input_filename) diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index 78fb001bc..a436e0e6e 100644 --- a/tensor2tensor/data_generators/lm1b.py +++ 
b/tensor2tensor/data_generators/lm1b.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -62,7 +63,7 @@ def _original_vocab(tmp_dir): def _replace_oov(original_vocab, line): """Replace out-of-vocab words with "UNK". - This maintains compatability with published results. + This maintains compatibility with published results. Args: original_vocab: a set of strings (The standard vocabulary for the dataset) @@ -137,12 +138,13 @@ def _get_or_build_subword_text_encoder(tmp_dir): return ret -def generator(tmp_dir, train): +def generator(tmp_dir, train, characters=False): """Generator for lm1b sentences. Args: tmp_dir: a string. train: a boolean. + characters: a boolean Yields: A dictionary {"inputs": [0], "targets": []} @@ -151,7 +153,10 @@ def generator(tmp_dir, train): original_vocab = _original_vocab(tmp_dir) files = (_train_data_filenames(tmp_dir) if train else [_dev_data_filename(tmp_dir)]) - encoder = _get_or_build_subword_text_encoder(tmp_dir) + if characters: + encoder = text_encoder.ByteTextEncoder() + else: + encoder = _get_or_build_subword_text_encoder(tmp_dir) for filepath in files: tf.logging.info("filepath = %s", filepath) for line in tf.gfile.Open(filepath): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 5beb0385f..67e3c6f90 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,6 +22,7 @@ from tensor2tensor.data_generators import generator_utils as utils from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import metrics import tensorflow as tf @@ -73,6 +75,10 @@ class SpaceID(object): CS_TOK = 21 # Czech characters CS_CHR = 22 + # Genetic bases (ACTG) + DNA = 23 + # Real numbers + REAL = 24 class Problem(object): @@ -106,6 +112,17 @@ class Problem(object): * hparams(defaults, model_hparams) - Specify the problem hyperparameters (see _default_hparams) - Mutate defaults as needed + * example_reading_spec + - Specify the names and types of the features on disk. + - Specify tf.contrib.slim.tfexample_decoder + * preprocess_examples(examples, mode) + - Preprocess the example feature dict from feature name to Tensor or + SparseTensor. + - Used in training, eval, and inference (specified by mode). + + Eval: + * eval_metrics + - Specify the set of evaluation metrics for this problem. 
Inference: * feature_encoders(data_dir) @@ -118,7 +135,7 @@ class Problem(object): # BEGIN SUBCLASS INTERFACE # ============================================================================ - def generate_data(self, data_dir, tmp_dir, num_shards=None): + def generate_data(self, data_dir, tmp_dir, task_id=-1): raise NotImplementedError() def hparams(self, defaults, model_hparams): @@ -134,6 +151,24 @@ def feature_encoders(self, data_dir): "targets": text_encoder.TextEncoder() } + def example_reading_spec(self): + data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = None + return (data_fields, data_items_to_decoders) + + def preprocess_examples(self, examples, mode): + del mode + return examples + + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY + ] + # ============================================================================ # END SUBCLASS INTERFACE # ============================================================================ @@ -196,6 +231,17 @@ def internal_hparams(self, model_hparams): _copy_problem_hparams(hp) return hp + def maybe_reverse_features(self, feature_map): + if not self._was_reversed: + return + inputs, targets = feature_map["inputs"], feature_map["targets"] + feature_map["inputs"], feature_map["targets"] = targets, inputs + + def maybe_copy_features(self, feature_map): + if not self._was_copy: + return + feature_map["targets"] = feature_map["inputs"] + def _copy_problem_hparams(p_hparams): """Use input modality, vocab, and space id for target.""" diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 51bc0ba62..2792c79e9 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -335,6 +336,22 @@ def lm1b_32k(model_hparams): return p +def lm1b_characters(unused_model_hparams): + """Billion-word language-modeling benchmark, byte-level character vocabulary.""" + p = default_problem_hparams() + # ratio of dev tokens (including eos) to dev words (including eos) + # 826189 / 159658 = 5.174742 + p.perplexity_exponent = 5.174742 + p.input_modality = {} + encoder = text_encoder.ByteTextEncoder() + p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) + p.vocabulary = { + "targets": encoder + } + p.target_space_id = 2 + return p + + def wiki_32k(model_hparams): """Wikipedia title to article. 32k subtoken vocabulary.""" p = default_problem_hparams() @@ -622,6 +639,7 @@ def image_celeba(unused_model_hparams): "audio_wsj_characters_test": audio_wsj_characters, "audio_wsj_tokens_8k_tune": lambda p: audio_wsj_tokens(p, 2**13), "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), + "lm1b_characters": lm1b_characters, "lm1b_32k": lm1b_32k, "wiki_32k": wiki_32k, "lmptb_10k": lmptb_10k, diff --git a/tensor2tensor/data_generators/problem_hparams_test.py b/tensor2tensor/data_generators/problem_hparams_test.py index ad1f0192d..df92919ef 100644 --- a/tensor2tensor/data_generators/problem_hparams_test.py +++ b/tensor2tensor/data_generators/problem_hparams_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index 9a7db3a78..f71f0d902 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors.
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/snli.py b/tensor2tensor/data_generators/snli.py index 7322c59ff..cd4ff723d 100644 --- a/tensor2tensor/data_generators/snli.py +++ b/tensor2tensor/data_generators/snli.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/test_data/corpus-1.txt b/tensor2tensor/data_generators/test_data/corpus-1.txt new file mode 100644 index 000000000..c05e47f90 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/corpus-1.txt @@ -0,0 +1,4 @@ +One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't +know. + +Groucho Marx diff --git a/tensor2tensor/data_generators/test_data/corpus-2.txt b/tensor2tensor/data_generators/test_data/corpus-2.txt new file mode 100644 index 000000000..f45577c4b --- /dev/null +++ b/tensor2tensor/data_generators/test_data/corpus-2.txt @@ -0,0 +1,3 @@ +I haven't slept for 10 days... because that would be too long. 
+ +Mitch Hedberg diff --git a/tensor2tensor/data_generators/test_data/vocab-1.txt b/tensor2tensor/data_generators/test_data/vocab-1.txt new file mode 100644 index 000000000..d34d3d957 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/vocab-1.txt @@ -0,0 +1,2 @@ +lollipop,8 +reverberated,12 diff --git a/tensor2tensor/data_generators/test_data/vocab-2.txt b/tensor2tensor/data_generators/test_data/vocab-2.txt new file mode 100644 index 000000000..7793af4f6 --- /dev/null +++ b/tensor2tensor/data_generators/test_data/vocab-2.txt @@ -0,0 +1,3 @@ +kattywampus,11 +balderdash,10 +jiggery-pokery,14 diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 8be22ce0b..4bb1c875d 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,20 +24,17 @@ from __future__ import division from __future__ import print_function -from collections import defaultdict +import collections import re # Dependency imports import six -from six import PY2 -from six import unichr # pylint: disable=redefined-builtin from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf - # Reserved tokens for things like padding and EOS symbols. 
PAD = "" EOS = "" @@ -45,7 +43,7 @@ PAD_ID = RESERVED_TOKENS.index(PAD) # Normally 0 EOS_ID = RESERVED_TOKENS.index(EOS) # Normally 1 -if PY2: +if six.PY2: RESERVED_TOKENS_BYTES = RESERVED_TOKENS else: RESERVED_TOKENS_BYTES = [bytes(PAD, "ascii"), bytes(EOS, "ascii")] @@ -55,18 +53,17 @@ # '\u' is converted to '_' # '\\' is converted to '\' # '\213;' is converted to unichr(213) -_UNESCAPE_REGEX = re.compile(u"|".join([r"\\u", r"\\\\", r"\\([0-9]+);"])) +_UNESCAPE_REGEX = re.compile(ur"\\u|\\\\|\\([0-9]+);") +_ESCAPE_CHARS = set(u"\\_;0123456789") def native_to_unicode_py2(s): """Python 2: transform native string to Unicode.""" - if isinstance(s, unicode): - return s - return s.decode("utf-8") + return s if isinstance(s, unicode) else s.decode("utf8") # Conversion between Unicode and UTF-8, if required (on Python2) -if PY2: +if six.PY2: native_to_unicode = native_to_unicode_py2 unicode_to_native = lambda s: s.encode("utf-8") else: @@ -130,7 +127,9 @@ class ByteTextEncoder(TextEncoder): def encode(self, s): numres = self._num_reserved_ids - if PY2: + if six.PY2: + if isinstance(s, unicode): + s = s.encode("utf-8") return [ord(c) + numres for c in s] # Python3: explicitly convert to UTF-8 return [c + numres for c in s.encode("utf-8")] @@ -144,7 +143,7 @@ def decode(self, ids): decoded_ids.append(RESERVED_TOKENS_BYTES[int(id_)]) else: decoded_ids.append(int2byte(id_ - numres)) - if PY2: + if six.PY2: return "".join(decoded_ids) # Python3: join byte arrays and then decode string return b"".join(decoded_ids).decode("utf-8", "replace") @@ -198,6 +197,55 @@ def _load_vocab_from_file(self, filename): self._id_to_token[idx] = tok +def _escape_token(token, alphabet): + """Escape away underscores and OOV characters and append '_'. + + This allows the token to be expressed as the concatenation of a list + of subtokens from the vocabulary. The underscore acts as a sentinel + which allows us to invertibly concatenate multiple such lists.
+ + Args: + token: A unicode string to be escaped. + alphabet: A set of all characters in the vocabulary's alphabet. + + Returns: + escaped_token: An escaped unicode string. + + Raises: + ValueError: If the provided token is not unicode. + """ + if not isinstance(token, six.text_type): + raise ValueError("Expected string type for token, got %s" % type(token)) + + token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + ret = [ + c if c in alphabet and c != u"\n" else ur"\%d;" % ord(c) + for c in token] + return u"".join(ret) + "_" + + +def _unescape_token(escaped_token): + """Inverse of _escape_token(). + + Args: + escaped_token: a unicode string + + Returns: + token: a unicode string + """ + def match(m): + if m.group(1) is None: + return u"_" if m.group(0) == u"\\u" else u"\\" + + try: + return six.unichr(int(m.group(1))) + except (ValueError, OverflowError) as _: + return "" + + trimmed = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token + return _UNESCAPE_REGEX.sub(match, trimmed) + + class SubwordTextEncoder(TextEncoder): """Class for invertibly encoding text using a limited vocabulary. @@ -246,7 +294,7 @@ def encode(self, raw_text): Returns: a list of integers in the range [0, vocab_size) """ - return self._tokens_to_subtokens(tokenizer.encode( + return self._tokens_to_subtoken_ids(tokenizer.encode( native_to_unicode(raw_text))) def decode(self, subtokens): @@ -258,14 +306,14 @@ def decode(self, subtokens): a native string """ return unicode_to_native(tokenizer.decode( - self._subtokens_to_tokens(subtokens))) + self._subtoken_ids_to_tokens(subtokens))) @property def vocab_size(self): """The subtoken vocabulary size.""" return len(self._all_subtoken_strings) - def _tokens_to_subtokens(self, tokens): + def _tokens_to_subtoken_ids(self, tokens): """Converts a list of tokens to a list of subtoken ids. 
Args: @@ -275,10 +323,11 @@ def _tokens_to_subtokens(self, tokens): """ ret = [] for token in tokens: - ret.extend(self._escaped_token_to_subtokens(self._escape_token(token))) + ret.extend(self._escaped_token_to_subtoken_ids( + _escape_token(token, self._alphabet))) return ret - def _subtokens_to_tokens(self, subtokens): + def _subtoken_ids_to_tokens(self, subtokens): """Converts a list of subtoken ids to a list of tokens. Args: @@ -287,40 +336,58 @@ def _subtokens_to_tokens(self, subtokens): a list of strings. """ concatenated = "".join( - [self._subtoken_to_subtoken_string(s) for s in subtokens]) + [self._subtoken_id_to_subtoken_string(s) for s in subtokens]) split = concatenated.split("_") - return [self._unescape_token(t + "_") for t in split if t] + return [_unescape_token(t + "_") for t in split if t] - def _subtoken_to_subtoken_string(self, subtoken): - """Subtoken_String (string) corresponding to the given subtoken (id).""" + def _subtoken_id_to_subtoken_string(self, subtoken): + """Converts a subtoken integer ID to a subtoken string.""" if 0 <= subtoken < self.vocab_size: return self._all_subtoken_strings[subtoken] return u"" - def _escaped_token_to_subtokens(self, escaped_token): - """Converts an escaped token string to a list of subtokens. + def _escaped_token_to_subtoken_strings(self, escaped_token): + """Converts an escaped token string to a list of subtoken strings. Args: - escaped_token: an escaped token + escaped_token: An escaped token as a unicode string. Returns: - a list of one or more integers. + A list of subtokens as unicode strings. """ + # NOTE: This algorithm is greedy; it won't necessarily produce the "best" + # list of subtokens. 
ret = [] - pos = 0 - lesc = len(escaped_token) - while pos < lesc: - end = min(lesc, pos + self._max_subtoken_len) - while end > pos: - subtoken = self._subtoken_string_to_id.get(escaped_token[pos:end], -1) - if subtoken != -1: + start = 0 + token_len = len(escaped_token) + while start < token_len: + for end in xrange( + min(token_len, start + self._max_subtoken_len), start, -1): + subtoken = escaped_token[start:end] + if subtoken in self._subtoken_string_to_id: + ret.append(subtoken) + start = end break - end -= 1 - assert end > pos - ret.append(subtoken) - pos = end + + else: # Did not break + # If there is no possible encoding of the escaped token then one of the + # characters in the token is not in the alphabet. This should be + # impossible and would be indicative of a bug. + assert False, "Token substring not found in subtoken vocabulary." return ret + def _escaped_token_to_subtoken_ids(self, escaped_token): + """Converts an escaped token string to a list of subtoken IDs. + + Args: + escaped_token: An escaped token as a unicode string. + Returns: + A list of subtoken IDs as integers. + """ + return [ + self._subtoken_string_to_id[subtoken] + for subtoken in self._escaped_token_to_subtoken_strings(escaped_token)] + @classmethod def build_to_target_size(cls, target_size, @@ -330,27 +397,37 @@ def build_to_target_size(cls, num_iterations=4): """Builds a SubwordTextEncoder that has `vocab_size` near `target_size`. - Uses simple recursive binary search to find a `min_count` value that most + Uses simple recursive binary search to find a minimum token count that most closely matches the `target_size`. Args: - target_size: desired vocab_size to approximate. - token_counts: a dictionary of string to int. - min_val: an integer - lower bound for `min_count`. - max_val: an integer - upper bound for `min_count`. - num_iterations: an integer. how many iterations of refinement. + target_size: Desired vocab_size to approximate. 
+ token_counts: A dictionary of token counts, mapping string to int. + min_val: An integer; lower bound for the minimum token count. + max_val: An integer; upper bound for the minimum token count. + num_iterations: An integer; how many iterations of refinement. Returns: - a SubwordTextEncoder instance. + A SubwordTextEncoder instance. + + Raises: + ValueError: If `min_val` is greater than `max_val`. """ + if min_val > max_val: + raise ValueError( + "Lower bound for the minimum token count " + "is greater than the upper bound.") + def bisect(min_val, max_val): """Bisection to find the right size.""" present_count = (max_val + min_val) // 2 tf.logging.info("Trying min_count %d" % present_count) subtokenizer = cls() - subtokenizer.build_from_token_counts(token_counts, - present_count, num_iterations) - if min_val >= max_val or subtokenizer.vocab_size == target_size: + subtokenizer.build_from_token_counts( + token_counts, present_count, num_iterations) + + # If min_val == max_val, we can't do any better than this. + if subtokenizer.vocab_size == target_size or min_val == max_val: return subtokenizer if subtokenizer.vocab_size > target_size: @@ -381,19 +458,12 @@ def build_from_token_counts(self, num_iterations: an integer. how many iterations of refinement. num_reserved_ids: an integer. how many ids to reserve for special tokens. """ - # first determine the alphabet to include all characters with count at - # least min_count in the dataset. 
- char_counts = defaultdict(int) - for token, count in six.iteritems(token_counts): - for c in token: - char_counts[c] += count - self._alphabet = set() - for c, count in six.iteritems(char_counts): - if count >= min_count: - self._alphabet.add(c) - # Make sure all characters needed for escaping are included - for c in u"\\_;0123456789": - self._alphabet.add(c) + self._init_alphabet_from_tokens(six.iterkeys(token_counts)) + + # Bootstrap the initial list of subtokens with the characters from the + # alphabet plus the escaping characters. + self._init_subtokens_from_list( + list(self._alphabet), reserved=num_reserved_ids) # We build iteratively. On each iteration, we segment all the words, # then count the resulting potential subtokens, keeping the ones @@ -402,63 +472,54 @@ def build_from_token_counts(self, min_count = 1 for i in xrange(num_iterations): tf.logging.info("Iteration {0}".format(i)) - counts = defaultdict(int) + + # Collect all substrings of the encoded token that break along current + # subtoken boundaries. + subtoken_counts = collections.defaultdict(int) for token, count in six.iteritems(token_counts): - escaped_token = self._escape_token(token) - # we will count all tails of the escaped_token, starting from boundaries - # determined by our current segmentation. 
- if i == 0: - starts = xrange(len(escaped_token)) - else: - subtokens = self._escaped_token_to_subtokens(escaped_token) - pos = 0 - starts = [] - for subtoken in subtokens: - starts.append(pos) - pos += len(self._all_subtoken_strings[subtoken]) - for start in starts: + escaped_token = _escape_token(token, self._alphabet) + subtokens = self._escaped_token_to_subtoken_strings(escaped_token) + start = 0 + for subtoken in subtokens: for end in xrange(start + 1, len(escaped_token) + 1): - subtoken_string = escaped_token[start:end] - counts[subtoken_string] += count - # Make sure all characters needed for escaping are included - for c in self._alphabet: - counts[c] += min_count - # Array of sets of candidate subtoken strings, by length + new_subtoken = escaped_token[start:end] + subtoken_counts[new_subtoken] += count + start += len(subtoken) + + # Array of sets of candidate subtoken strings, by length. len_to_subtoken_strings = [] - for subtoken_string, count in six.iteritems(counts): + for subtoken_string, count in six.iteritems(subtoken_counts): lsub = len(subtoken_string) if count >= min_count: - # Add this subtoken string to its length set while len(len_to_subtoken_strings) <= lsub: len_to_subtoken_strings.append(set()) len_to_subtoken_strings[lsub].add(subtoken_string) - new_subtoken_strings = [] - # consider the candidates longest to shortest, so that if we accept + + # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. 
- for lsub in reversed(range(1, len(len_to_subtoken_strings))): + new_subtoken_strings = [] + for lsub in xrange(len(len_to_subtoken_strings)-1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: - count = counts[subtoken_string] + count = subtoken_counts[subtoken_string] if count >= min_count: - new_subtoken_strings.append((count, subtoken_string)) + # Exclude alphabet tokens here, as they must be included later, + # explicitly, regardless of count. + if subtoken_string not in self._alphabet: + new_subtoken_strings.append((count, subtoken_string)) for l in xrange(1, lsub): - counts[subtoken_string[:l]] -= count - # Sort in decreasing order by count + subtoken_counts[subtoken_string[:l]] -= count + + # Include the alphabet explicitly to guarantee all strings are encodable. + new_subtoken_strings.extend( + (subtoken_counts.get(a, 0), a) for a in self._alphabet) new_subtoken_strings.sort(reverse=True) - # Now we have a candidate vocabulary - old_alphabet = self._alphabet - self._init_from_list([u""] * num_reserved_ids + - [p[1] for p in new_subtoken_strings]) - assert old_alphabet == self._alphabet - tf.logging.info("vocab_size = %d" % self.vocab_size) - original = "This sentence was encoded by the SubwordTextEncoder." - encoded = self.encode(original) - print(encoded) - print([self._subtoken_to_subtoken_string(s) for s in encoded]) - decoded = self.decode(encoded) - print(decoded) - assert decoded == original + # Reinitialize to the candidate vocabulary. 
+ self._init_subtokens_from_list( + [subtoken for _, subtoken in new_subtoken_strings], + reserved=num_reserved_ids) + tf.logging.info("vocab_size = %d" % self.vocab_size) def dump(self): """Debugging dump of the current subtoken vocabulary.""" @@ -467,15 +528,21 @@ def dump(self): print(u", ".join(u"{0} : '{1}'".format(i, s) for i, s in sorted(subtoken_strings))) - def _init_from_list(self, subtoken_strings): - """Initialize from a list of subtoken strings.""" - self._all_subtoken_strings = subtoken_strings + def _init_subtokens_from_list(self, subtoken_strings, reserved=0): + """Initialize token information from a list of subtoken strings.""" + self._all_subtoken_strings = [u""] * reserved + subtoken_strings # we remember the maximum length of any subtoken to avoid having to # check arbitrarily long strings. self._max_subtoken_len = max([len(s) for s in subtoken_strings]) self._subtoken_string_to_id = { - s: i for i, s in enumerate(subtoken_strings) if s} - self._alphabet = set([c for c in subtoken_strings if len(c) == 1]) + s: i+reserved for i, s in enumerate(subtoken_strings) if s} + + def _init_alphabet_from_tokens(self, tokens): + """Initialize alphabet from an iterable of token or subtoken strings.""" + # Include all characters from all tokens in the alphabet to guarantee that + # any token can be encoded. Additionally, include all escaping characters. 
+ self._alphabet = {c for token in tokens for c in token} + self._alphabet |= _ESCAPE_CHARS def _load_from_file(self, filename): """Load from a file.""" @@ -483,51 +550,10 @@ def _load_from_file(self, filename): with tf.gfile.Open(filename) as f: for line in f: subtoken_strings.append(native_to_unicode(line.strip()[1:-1])) - self._init_from_list(subtoken_strings) + self._init_subtokens_from_list(subtoken_strings) + self._init_alphabet_from_tokens(subtoken_strings) def store_to_file(self, filename): with tf.gfile.Open(filename, "w") as f: for subtoken_string in self._all_subtoken_strings: f.write("'" + unicode_to_native(subtoken_string) + "'\n") - - def _escape_token(self, token): - """Escape away underscores and OOV characters and append '_'. - - This allows the token to be experessed as the concatenation of a list - of subtokens from the vocabulary. The underscore acts as a sentinel - which allows us to invertibly concatenate multiple such lists. - - Args: - token: a unicode string - Returns: - escaped_token: a unicode string - """ - assert isinstance(token, six.text_type) - token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + u"_" - ret = u"" - for c in token: - if c in self._alphabet and c != u"\n": - ret += c - else: - ret += u"\\%d;" % ord(c) - return ret - - def _unescape_token(self, escaped_token): - """Inverse of _escape_token(). 
- - Args: - escaped_token: a unicode string - Returns: - token: a unicode string - """ - def match(m): - if m.group(1) is not None: - # Convert '\213;' to unichr(213) - try: - return unichr(int(m.group(1))) - except (ValueError, OverflowError) as _: - return "" - # Convert '\u' to '_' and '\\' to '\' - return u"_" if m.group(0) == u"\\u" else u"\\" - # Cut off the trailing underscore and apply the regex substitution - return _UNESCAPE_REGEX.sub(match, escaped_token[:-1]) diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py index 093101c68..47e82a176 100644 --- a/tensor2tensor/data_generators/text_encoder_build_subword.py +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -38,29 +39,43 @@ import tensorflow as tf -tf.app.flags.DEFINE_string('output_fn', '/tmp/my.subword_text_encoder', - 'where to store the SubwordTextEncoder') -tf.app.flags.DEFINE_string('corpus_filepattern', '', - 'Corpus of one or more text files') -tf.app.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus') -tf.app.flags.DEFINE_integer('corpus_max_lines', 10000, - 'How many lines of corpus to read') -tf.app.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations') -tf.app.flags.DEFINE_bool('split_on_newlines', True, 'Break corpus into lines.') -FLAGS = tf.app.flags.FLAGS +tf.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder', + 'where to store the SubwordTextEncoder') +tf.flags.DEFINE_string('corpus_filepattern', '', + 'Corpus of one or more text files') +tf.flags.DEFINE_string('vocab_filepattern', '', 'One or more vocabulary files ' + '(one word per line as "word,count")') +tf.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus') +tf.flags.DEFINE_integer('corpus_max_lines', 10000, + 'How many 
lines of corpus to read') +tf.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations') +tf.flags.DEFINE_bool('split_on_newlines', True, 'Break corpus into lines.') +FLAGS = tf.flags.FLAGS def main(unused_argv): - gs = text_encoder.SubwordTextEncoder() - if not FLAGS.corpus_filepattern: - raise ValueError('Must provide --corpus_filepattern') - token_counts = tokenizer.corpus_token_counts( - FLAGS.corpus_filepattern, FLAGS.corpus_max_lines, - split_on_newlines=FLAGS.split_on_newlines) - gs.build_from_token_counts(token_counts, - FLAGS.min_count, - FLAGS.num_iterations) - gs.store_to_file(FLAGS.output_fn) + if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern: + raise ValueError( + 'Must only provide one of --corpus_filepattern or --vocab_filepattern') + + elif FLAGS.corpus_filepattern: + token_counts = tokenizer.corpus_token_counts( + FLAGS.corpus_filepattern, + FLAGS.corpus_max_lines, + split_on_newlines=FLAGS.split_on_newlines) + + elif FLAGS.vocab_filepattern: + token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern, + FLAGS.corpus_max_lines) + + else: + raise ValueError( + 'Must provide one of --corpus_filepattern or --vocab_filepattern') + + encoder = text_encoder.SubwordTextEncoder() + encoder.build_from_token_counts(token_counts, FLAGS.min_count, + FLAGS.num_iterations) + encoder.store_to_file(FLAGS.output_fn) if __name__ == '__main__': diff --git a/tensor2tensor/data_generators/text_encoder_test.py b/tensor2tensor/data_generators/text_encoder_test.py new file mode 100644 index 000000000..4142f8699 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder_test.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for tensor2tensor.data_generators.text_encoder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import collections + +# Dependency imports +import mock + +from tensor2tensor.data_generators import text_encoder +import tensorflow as tf + + +class EscapeUnescapeTokenTest(tf.test.TestCase): + + def test_escape_token(self): + escaped = text_encoder._escape_token( + 'Foo! Bar.\nunder_score back\\slash', + set('abcdefghijklmnopqrstuvwxyz .\n') | text_encoder._ESCAPE_CHARS) + + self.assertEqual( + '\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_', escaped) + + def test_unescape_token(self): + unescaped = text_encoder._unescape_token( + '\\70;oo\\33; \\66;ar.\\10;under\\uscore back\\\\slash_') + + self.assertEqual( + 'Foo! Bar.\nunder_score back\\slash', unescaped) + + +class SubwordTextEncoderTest(tf.test.TestCase): + + def test_encode_decode(self): + corpus = ( + 'This is a corpus of text that provides a bunch of tokens from which ' + 'to build a vocabulary. It will be used when strings are encoded ' + 'with a TextEncoder subclass. The encoder was coded by a coder.') + token_counts = collections.Counter(corpus.split(' ')) + alphabet = set(corpus) ^ {' '} + + original = 'This is a coded sentence encoded by the SubwordTextEncoder.' + token_counts.update(original.split(' ')) + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + + # Encoding should be reversible. 
+ encoded = encoder.encode(original) + decoded = encoder.decode(encoded) + self.assertEqual(original, decoded) + + # The substrings coded and coder are frequent enough in the corpus that + # they should appear in the vocabulary even though they are substrings + # of other included strings. + subtoken_strings = {encoder._all_subtoken_strings[i] for i in encoded} + self.assertIn('encoded_', subtoken_strings) + self.assertIn('coded_', subtoken_strings) + self.assertIn('TextEncoder', encoder._all_subtoken_strings) + self.assertIn('coder', encoder._all_subtoken_strings) + + # Every character in the corpus should be in the encoder's alphabet and + # its subtoken vocabulary. + self.assertTrue(alphabet.issubset(encoder._alphabet)) + for a in alphabet: + self.assertIn(a, encoder._all_subtoken_strings) + + def test_unicode(self): + corpus = 'Cat emoticons. \U0001F638 \U0001F639 \U0001F63A \U0001F63B' + token_counts = collections.Counter(corpus.split(' ')) + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + + self.assertIn('\U0001F638', encoder._alphabet) + self.assertIn('\U0001F63B', encoder._all_subtoken_strings) + + def test_small_vocab(self): + corpus = 'The quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + alphabet = set(corpus) ^ {' '} + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 10, token_counts, 2, 10) + + # All vocabulary elements are in the alphabet and subtoken strings even + # if we requested a smaller vocabulary to assure all expected strings + # are encodable. 
+ self.assertTrue(alphabet.issubset(encoder._alphabet)) + for a in alphabet: + self.assertIn(a, encoder._all_subtoken_strings) + + def test_encodable_when_not_in_alphabet(self): + corpus = 'the quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + original = 'This has UPPER CASE letters that are out of alphabet' + + # Early versions could have an infinite loop when breaking into subtokens + # if there was any out-of-alphabet characters in the encoded string. + encoded = encoder.encode(original) + decoded = encoder.decode(encoded) + + self.assertEqual(original, decoded) + encoded_str = ''.join(encoder._all_subtoken_strings[i] for i in encoded) + self.assertIn('\\84;', encoded_str) + + @mock.patch.object(text_encoder, '_ESCAPE_CHARS', new=set('\\_;13579')) + def test_raises_exception_when_not_encodable(self): + corpus = 'the quick brown fox jumps over the lazy dog' + token_counts = collections.Counter(corpus.split(' ')) + + # Deliberately exclude some required encoding chars from the alphabet + # and token list, making some strings unencodable. + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 100, token_counts, 2, 10) + original = 'This has UPPER CASE letters that are out of alphabet' + + # Previously there was a bug which produced an infinite loop in this case. + with self.assertRaises(AssertionError): + encoder.encode(original) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 2b1cf572c..0e8daa75f 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -29,7 +30,7 @@ alphanumeric character and a non-alphanumeric character. This produces a list which alternates between "alphanumeric tokens" (strings of alphanumeric characters) and "non-alphanumeric tokens" - (strings of of non-alphanumeric characters). + (strings of non-alphanumeric characters). 2. Remove every token consisting of a single space, unless it is the very first or very last token in the list. These tokens are now @@ -43,28 +44,25 @@ from __future__ import division from __future__ import print_function -from collections import defaultdict +import collections import sys import unicodedata # Dependency imports -from six import PY2 -from six import unichr # pylint: disable=redefined-builtin +import six from six.moves import xrange # pylint: disable=redefined-builtin - import tensorflow as tf - # Conversion between Unicode and UTF-8, if required (on Python2) -_native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s) +_native_to_unicode = (lambda s: s.decode("utf-8")) if six.PY2 else (lambda s: s) # This set contains all letter and number characters. 
_ALPHANUMERIC_CHAR_SET = set( - unichr(i) for i in xrange(sys.maxunicode) - if (unicodedata.category(unichr(i)).startswith("L") or - unicodedata.category(unichr(i)).startswith("N"))) + six.unichr(i) for i in xrange(sys.maxunicode) + if (unicodedata.category(six.unichr(i)).startswith("L") or + unicodedata.category(six.unichr(i)).startswith("N"))) def encode(text): @@ -100,51 +98,95 @@ def decode(tokens): Returns: a unicode string """ - ret = u"" token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens] + ret = [] for i, token in enumerate(tokens): if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]: - ret += u" " - ret += token - return ret + ret.append(u" ") + ret.append(token) + return "".join(ret) + +def _read_filepattern(filepattern, max_lines=None, split_on_newlines=True): + """Reads files matching a wildcard pattern, yielding the contents. -def corpus_token_counts(text_filepattern, corpus_max_lines, - split_on_newlines=True): + Args: + filepattern: A wildcard pattern matching one or more files. + max_lines: If set, stop reading after reading this many lines. + split_on_newlines: A boolean. If true, then split files by lines and strip + leading and trailing whitespace from each line. Otherwise, treat each + file as a single string. + + Yields: + The contents of the files as lines, if split_on_newlines is True, or + the entire contents of each file if False. 
+ """ + filenames = sorted(tf.gfile.Glob(filepattern)) + lines_read = 0 + for filename in filenames: + with tf.gfile.Open(filename) as f: + if split_on_newlines: + for line in f: + yield line.strip() + lines_read += 1 + if max_lines and lines_read >= max_lines: + return + + else: + if max_lines: + doc = [] + for line in f: + doc.append(line) + lines_read += 1 + if max_lines and lines_read >= max_lines: + yield "".join(doc) + return + yield "".join(doc) + + else: + yield f.read() + + +def corpus_token_counts( + text_filepattern, corpus_max_lines, split_on_newlines=True): """Read the corpus and compute a dictionary of token counts. Args: - text_filepattern: a pattern matching one or more files - corpus_max_lines: an integer - maximum total lines to read. - split_on_newlines: a boolean. If true, then split files by lines and strip - leading and trailing whitespace from each line. + text_filepattern: A pattern matching one or more files. + corpus_max_lines: An integer; maximum total lines to read. + split_on_newlines: A boolean. If true, then split files by lines and strip + leading and trailing whitespace from each line. Otherwise, treat each + file as a single string. Returns: - a dictionary from token to count. + a dictionary mapping token to count. 
""" - def read_corpus(): - """Read the corpus.""" - docs = [] - lines_read = 0 - filenames = tf.gfile.Glob(text_filepattern) - for text_filename in filenames: - with tf.gfile.Open(text_filename) as f: - if not split_on_newlines: - docs.append("") - for line in f: - if split_on_newlines: - # The tokenizer updates token_counts in encode() - docs.append(line.strip()) - else: - docs[-1] += line - lines_read += 1 - if corpus_max_lines > 0 and lines_read > corpus_max_lines: - return docs - return docs - - counts = defaultdict(int) - for doc in read_corpus(): - for tok in encode(_native_to_unicode(doc)): - counts[tok] += 1 + counts = collections.Counter() + for doc in _read_filepattern( + text_filepattern, + max_lines=corpus_max_lines, + split_on_newlines=split_on_newlines): + counts.update(encode(_native_to_unicode(doc))) + return counts + +def vocab_token_counts(text_filepattern, max_lines): + """Read a vocab file and return a dictionary of token counts. + + Reads a two-column CSV file of tokens and their frequency in a dataset. The + tokens are presumed to be generated by encode() or the equivalent. + + Args: + text_filepattern: A pattern matching one or more files. + max_lines: An integer; maximum total lines to read. + + Returns: + a dictionary mapping token to count. + """ + ret = {} + for line in _read_filepattern(text_filepattern, max_lines=max_lines): + token, count = line.rsplit(",", 1) + ret[_native_to_unicode(token)] = int(count) + + return ret diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index c279290ed..0c299bd0b 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,45 +20,126 @@ from __future__ import division from __future__ import print_function +import os import random # Dependency imports -from six import unichr # pylint: disable=redefined-builtin +import six from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer - import tensorflow as tf +FLAGS = tf.flags.FLAGS + +pkg_dir, _ = os.path.split(__file__) +_TESTDATA = os.path.join(pkg_dir, "test_data") + class TokenizerTest(tf.test.TestCase): - def testEncode(self): - self.assertEqual( - tokenizer.encode(u"Dude - that's so cool."), - [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]) - self.assertEqual( - tokenizer.encode(u"Łukasz est né en 1981."), - [u"Łukasz", u"est", u"né", u"en", u"1981", u"."]) - self.assertEqual( - tokenizer.encode(u" Spaces at the ends "), - [u" ", u"Spaces", u"at", u"the", u"ends", u" "]) - self.assertEqual(tokenizer.encode(u"802.11b"), [u"802", u".", u"11b"]) - self.assertEqual(tokenizer.encode(u"two. \nlines"), - [u"two", u". \n", u"lines"]) + def test_encode(self): + self.assertListEqual( + [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."], + tokenizer.encode(u"Dude - that's so cool.")) + self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."], + tokenizer.encode(u"Łukasz est né en 1981.")) + self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "], + tokenizer.encode(u" Spaces at the ends ")) + self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b")) + self.assertListEqual([u"two", u". \n", u"lines"], + tokenizer.encode(u"two. 
\nlines")) - def testDecode(self): + def test_decode(self): self.assertEqual( + u"Dude - that's so cool.", tokenizer.decode( - [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]), - u"Dude - that's so cool.") + [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])) - def testInvertibilityOnRandomStrings(self): - random.seed(123) + def test_invertibility_on_random_strings(self): for _ in xrange(1000): - s = u"".join([unichr(random.randint(0, 65535)) for _ in xrange(10)]) + s = u"".join(six.unichr(random.randint(0, 65535)) for _ in xrange(10)) self.assertEqual(s, tokenizer.decode(tokenizer.encode(s))) +class TestTokenCounts(tf.test.TestCase): + + def setUp(self): + super(TestTokenCounts, self).setUp() + self.corpus_path = os.path.join(_TESTDATA, "corpus-*.txt") + self.vocab_path = os.path.join(_TESTDATA, "vocab-*.txt") + + def test_corpus_token_counts_split_on_newlines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=0, split_on_newlines=True) + + expected = { + u"'": 2, + u".": 2, + u". ": 1, + u"... 
": 1, + u"Groucho": 1, + u"Marx": 1, + u"Mitch": 1, + u"Hedberg": 1, + u"I": 3, + u"in": 2, + u"my": 2, + u"pajamas": 2, + } + self.assertDictContainsSubset(expected, token_counts) + self.assertNotIn(u".\n\n", token_counts) + self.assertNotIn(u"\n", token_counts) + + def test_corpus_token_counts_no_split_on_newlines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=0, split_on_newlines=False) + + self.assertDictContainsSubset({u".\n\n": 2, u"\n": 3}, token_counts) + + def test_corpus_token_counts_split_with_max_lines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=5, split_on_newlines=True) + + self.assertIn(u"slept", token_counts) + self.assertNotIn(u"Mitch", token_counts) + + def test_corpus_token_counts_no_split_with_max_lines(self): + token_counts = tokenizer.corpus_token_counts( + self.corpus_path, corpus_max_lines=5, split_on_newlines=False) + + self.assertIn(u"slept", token_counts) + self.assertNotIn(u"Mitch", token_counts) + self.assertDictContainsSubset({ + u".\n\n": 1, + u"\n": 2, + u".\n": 1 + }, token_counts) + + def test_vocab_token_counts(self): + token_counts = tokenizer.vocab_token_counts(self.vocab_path, 0) + + expected = { + u"lollipop": 8, + u"reverberated": 12, + u"kattywampus": 11, + u"balderdash": 10, + u"jiggery-pokery": 14, + } + self.assertDictEqual(expected, token_counts) + + def test_vocab_token_counts_with_max_lines(self): + # vocab-1 has 2 lines, vocab-2 has 3 + token_counts = tokenizer.vocab_token_counts(self.vocab_path, 4) + + expected = { + u"lollipop": 8, + u"reverberated": 12, + u"kattywampus": 11, + u"balderdash": 10, + } + self.assertDictEqual(expected, token_counts) + + if __name__ == "__main__": tf.test.main() diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 8f905aa96..49147962a 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -1,3 +1,4 @@ +# 
coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 456970e62..97b191096 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,6 +28,7 @@ from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators import wsj_parsing +from tensor2tensor.utils import metrics from tensor2tensor.utils import registry import tensorflow as tf @@ -57,9 +59,9 @@ def train_generator(self, data_dir, tmp_dir, is_training): """Generator of the training data.""" raise NotImplementedError() - def dev_generator(self, data_dir, tmp_dir, is_training): + def dev_generator(self, data_dir, tmp_dir): """Generator of the development data.""" - return self.train_generator(data_dir, tmp_dir, is_training) + return self.train_generator(data_dir, tmp_dir, False) @property def input_space_id(self): @@ -81,26 +83,24 @@ def vocab_name(self): def vocab_file(self): return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) - def generate_data(self, data_dir, tmp_dir, num_shards=None): - if num_shards is None: - num_shards = self.num_shards + def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_dataset_and_shuffle( - self.train_generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, num_shards, shuffled=False), - self.dev_generator(data_dir, tmp_dir, False), - self.dev_filepaths(data_dir, 1, shuffled=False)) + self.train_generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.num_shards, shuffled=False), + self.dev_generator(data_dir, tmp_dir), + self.dev_filepaths(data_dir, 1, shuffled=False)) def 
feature_encoders(self, data_dir): if self.is_character_level: return { - "inputs": text_encoder.ByteTextEncoder(), - "targets": text_encoder.ByteTextEncoder(), + "inputs": text_encoder.ByteTextEncoder(), + "targets": text_encoder.ByteTextEncoder(), } vocab_filename = os.path.join(data_dir, self.vocab_file) subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) return { - "inputs": subtokenizer, - "targets": subtokenizer, + "inputs": subtokenizer, + "targets": subtokenizer, } def hparams(self, defaults, unused_model_hparams): @@ -119,6 +119,13 @@ def hparams(self, defaults, unused_model_hparams): if self.is_character_level: p.loss_multiplier = 2.0 + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU + ] + # Generic generators used later for multiple problems. @@ -265,7 +272,7 @@ def bi_vocabs_token_generator(source_path, ] _ENDE_TEST_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/dev.tgz", + "http://data.statmt.org/wmt17/translation-task/dev.tgz", ("dev/newstest2013.en", "dev/newstest2013.de") ], ] @@ -295,7 +302,7 @@ def bi_vocabs_token_generator(source_path, ] _ENFR_TEST_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/dev.tgz", + "http://data.statmt.org/wmt17/translation-task/dev.tgz", ("dev/newstest2013.en", "dev/newstest2013.fr") ], ] @@ -328,7 +335,7 @@ def bi_vocabs_token_generator(source_path, # English-Czech datasets _ENCS_TRAIN_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long ("training-parallel-nc-v11/news-commentary-v11.cs-en.en", "training-parallel-nc-v11/news-commentary-v11.cs-en.cs") ], @@ -343,7 +350,7 @@ def bi_vocabs_token_generator(source_path, ] _ENCS_TEST_DATASETS = [ [ - 
"http://data.statmt.org/wmt16/translation-task/dev.tgz", + "http://data.statmt.org/wmt17/translation-task/dev.tgz", ("dev/newstest2013.en", "dev/newstest2013.cs") ], ] @@ -433,7 +440,8 @@ def train_generator(self, data_dir, tmp_dir, train): datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", symbolizer_vocab, EOS) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -466,7 +474,7 @@ def train_generator(self, tmp_dir, train): tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag) return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) + character_vocab, EOS) @property def input_space_id(self): @@ -500,7 +508,7 @@ def train_generator(self, data_dir, tmp_dir, train): tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", - source_vocab, target_vocab, EOS) + source_vocab, target_vocab, EOS) @property def input_space_id(self): @@ -540,13 +548,14 @@ class WMTEnFrTokens8k(WMTProblem): def targeted_vocab_size(self): return 2**13 # 8192 - def train_generator(self, tmp_dir, train): + def train_generator(self, data_dir, tmp_dir, train): symbolizer_vocab = generator_utils.get_or_generate_vocab( data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size) datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", symbolizer_vocab, EOS) + return token_generator(data_path + ".lang1", data_path + ".lang2", + symbolizer_vocab, EOS) 
@property def input_space_id(self): @@ -579,7 +588,7 @@ def train_generator(self, data_dir, tmp_dir, train): tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag) return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) + character_vocab, EOS) @property def input_space_id(self): @@ -612,7 +621,7 @@ def train_generator(self, data_dir, tmp_dir, train): tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -622,6 +631,7 @@ def input_space_id(self): def target_space_id(self): return problem.SpaceID.EN_TOK + @registry.register_problem("wmt_encs_tokens_32k") class WMTEnCsTokens32k(problem.Problem): """Problem spec for WMT English-Czech translation.""" @@ -644,7 +654,7 @@ def train_generator(self, data_dir, tmp_dir, train): tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag) return token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) + symbolizer_vocab, EOS) @property def input_space_id(self): @@ -654,6 +664,13 @@ def input_space_id(self): def target_space_id(self): return problem.SpaceID.CS_TOK + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU + ] + @registry.register_problem("wmt_encs_characters") class WMTEnCsCharacters(WMTProblem): @@ -669,7 +686,7 @@ def train_generator(self, data_dir, tmp_dir, train): tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_encs_chr_%s" % tag) return character_generator(data_path + ".lang1", data_path + ".lang2", - character_vocab, EOS) + character_vocab, EOS) @property def input_space_id(self): @@ 
-680,15 +697,6 @@ def target_space_id(self): return problem.SpaceID.CS_CHR -# TODO This function is not used anywhere. -def parsing_character_generator(tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - filename = "parsing_%s" % ("train" if train else "dev") - text_filepath = os.path.join(tmp_dir, filename + ".text") - tags_filepath = os.path.join(tmp_dir, filename + ".tags") - return character_generator(text_filepath, tags_filepath, character_vocab, EOS) - - def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, source_vocab_size, target_vocab_size): """Generate source and target data from a single file.""" diff --git a/tensor2tensor/data_generators/wmt_test.py b/tensor2tensor/data_generators/wmt_test.py index 86b88e5b1..441ceef59 100644 --- a/tensor2tensor/data_generators/wmt_test.py +++ b/tensor2tensor/data_generators/wmt_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/data_generators/wsj_parsing.py b/tensor2tensor/data_generators/wsj_parsing.py index 200754e16..4b1dbdd80 100644 --- a/tensor2tensor/data_generators/wsj_parsing.py +++ b/tensor2tensor/data_generators/wsj_parsing.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/models/__init__.py +++ b/tensor2tensor/models/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 752de038e..3b874555f 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 2754e8366..4b37050bb 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index 95216f43d..3ac477e4b 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index b3f18249d..d4ce85b1a 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index 301626dc2..28862e594 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index f1e42669e..738b84251 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index c8b4a6068..1a8b2c79d 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -65,9 +66,6 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) - signal = tf.reshape(signal, [length, 2, num_timescales]) - signal = tf.transpose(signal, perm=[0, 2, 1]) - signal = tf.reshape(signal, [length, channels]) signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) signal = tf.reshape(signal, [1, length, channels]) return x + signal @@ -434,6 +432,91 @@ def local(x): return output +def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, + name=None): + """strided block local self-attention. + + Args: + q: a Tensor with shape [batch, heads, length, depth_k] + k: a Tensor with shape [batch, heads, length, depth_k] + v: a Tensor with shape [batch, heads, length, depth_v] + block_length: an integer + filter_width: an integer indicating how much to look left. 
+ name: an optional string + + Returns: + a Tensor of shape [batch, heads, length, depth_v] + """ + with tf.variable_scope(name, default_name="local_self_attention_1d", + values=[q, k, v]): + v_shape = v.get_shape() + depth_v = tf.shape(v)[3] + batch_size = tf.shape(q)[0] + num_heads = tf.shape(q)[1] + original_length = tf.shape(q)[2] + # making sure q is a multiple of d + def pad_to_multiple(x, pad_length): + x_length = tf.shape(x)[2] + return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]]) + def pad_l_and_r(x, pad_length): + return tf.pad(x, [[0, 0], [0, 0], [pad_length, pad_length], [0, 0]]) + q = pad_to_multiple(q, block_length) + k = pad_to_multiple(k, block_length) + v = pad_to_multiple(v, block_length) + + # Setting up q blocks + new_q_shape = tf.shape(q) + # Setting up q blocks + q = tf.reshape(q, [new_q_shape[0], new_q_shape[1], + new_q_shape[2]//block_length, + block_length, new_q_shape[3]]) + + # Setting up k and v values + k = pad_l_and_r(k, filter_width) + v = pad_l_and_r(v, filter_width) + + length = tf.shape(k)[2] + full_filter_width = block_length + 2*filter_width + # getting gather indices + indices = tf.range(0, length, delta=1, name="index_range") + # making indices [1, length, 1] to appy convs + indices = tf.reshape(indices, [1, -1, 1]) + kernel = tf.expand_dims(tf.eye(full_filter_width), axis=1) + gather_indices = tf.nn.conv1d( + tf.cast(indices, tf.float32), + kernel, + block_length, + padding="VALID", + name="gather_conv") + + gather_indices = tf.squeeze(tf.cast(gather_indices, tf.int32), axis=0) + + # [length, batch, heads, dim] + k_t = tf.transpose(k, [2, 0, 1, 3]) + k_new = tf.gather(k_t, gather_indices) + + # [batch, heads, blocks, block_length, dim] + k_new = tf.transpose(k_new, [2, 3, 0, 1, 4]) + + attention_bias = tf.expand_dims( + tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) + + v_t = tf.transpose(v, [2, 0, 1, 3]) + v_new = tf.gather(v_t, gather_indices) + v_new = tf.transpose(v_new, [2, 3, 0, 1, 4]) + + 
logits = tf.matmul(q, k_new, transpose_b=True) + + attention = tf.nn.softmax(logits+attention_bias) + output = tf.matmul(attention, v_new) + + output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) + # Remove the padding if introduced + output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) + output.set_shape(v_shape) + return output + + def multihead_attention(query_antecedent, memory_antecedent, bias, @@ -459,8 +542,9 @@ def multihead_attention(query_antecedent, dropout_rate: a floating point number image_shapes: optional tuple of integer scalars. see comments for attention_image_summary() - attention_type: a string, either "dot_product" or "local_mask_right" - block_length: an integer - relevent for "local_mask_right" + attention_type: a string, either "dot_product" or "local_mask_right" or + "local_unmasked" + block_length: an integer - relevant for "local_mask_right" name: an optional string Returns: @@ -508,9 +592,11 @@ def multihead_attention(query_antecedent, if attention_type == "dot_product": x = dot_product_attention( q, k, v, bias, dropout_rate, image_shapes) - else: - assert attention_type == "local_mask_right" + elif attention_type == "local_mask_right": x = masked_local_attention_1d(q, k, v, block_length=block_length) + else: + assert attention_type == "local_unmasked" + x = unmasked_local_attention_1d(q, k, v, block_length=block_length) x = combine_heads(x) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x @@ -651,4 +737,5 @@ def parameter_attention(x, y = tf.reshape(y, [batch_size, length, total_value_depth]) y.set_shape([None, None, total_value_depth]) y = common_layers.conv1d(y, output_depth, 1, name="output_transform") + return y diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index ff856968b..e36b2e4e1 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 
The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -49,6 +50,8 @@ def basic_params1(): # when not in training mode. dropout=0.2, clip_grad_norm=2.0, + grad_noise_scale=0.0, + summarize_grads=int(False), initializer="orthogonal", initializer_gain=1.5, label_smoothing=0.1, diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index 638535aa2..e98531d88 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,7 +31,7 @@ from tensorflow.python.framework import function # This is a global setting. When turned off, no @function.Defun is used. -allow_defun = True +allow_defun = False def saturating_sigmoid(x): @@ -468,7 +469,10 @@ def get_norm(norm_type): "'noam', 'none'.") -def residual_fn(x, y, norm_type, residual_dropout, +def residual_fn(x, + y, + norm_type, + residual_dropout, filters=None, epsilon=1e-16, name="residual"): @@ -558,11 +562,17 @@ def conv_block_internal(conv_fn, def conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): - """A block of standard convolutions.""" + """A block of standard 2d convolutions.""" return conv_block_internal(conv, inputs, filters, dilation_rates_and_kernel_sizes, **kwargs) +def conv1d_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): + """A block of standard 1d convolutions.""" + return conv_block_internal(conv1d, inputs, filters, + dilation_rates_and_kernel_sizes, **kwargs) + + def separable_conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs): """A block of separable convolutions.""" @@ -857,10 +867,7 @@ def multiscale_conv_sum(inputs, output_size, dilation_rates_and_kernel_sizes, return tf.add_n(results) * (len(results)**-0.5) -def multiscale_conv_and_attention(x, - padding, - hparams, - source=None): +def 
multiscale_conv_and_attention(x, padding, hparams, source=None): """A common part of t2t layers. First, do a linear multiscale convolution @@ -924,10 +931,7 @@ def conv_with_pools(inputs, output_size, kernel_size, pool_sizes, pooling_type, return tf.add_n(results) * (len(results)**-0.5) -def conv_with_pools_and_attention(x, - padding, - hparams, - source=None): +def conv_with_pools_and_attention(x, padding, hparams, source=None): """A common part of t2t layers. First, do conv_with_pools @@ -1388,8 +1392,8 @@ def padded_cross_entropy(logits, vocab_size = tf.shape(logits)[-1] with tf.name_scope("padded_cross_entropy", [logits, labels]): pad_logits, pad_labels = pad_with_zeros(logits, labels) - xent = smoothing_cross_entropy(pad_logits, pad_labels, - vocab_size, confidence) + xent = smoothing_cross_entropy(pad_logits, pad_labels, vocab_size, + confidence) weights = weights_fn(pad_labels) if not reduce_sum: return xent * weights, weights @@ -1492,8 +1496,8 @@ def linear_set_layer(layer_size, # Unfortunately tf doesn't support broadcasting via concat, but we can # simply add the transformed context to get the same effect. context = tf.expand_dims(context, axis=1) - cont_tfm = conv1d(context, layer_size, 1, - activation=None, name="cont_conv") + cont_tfm = conv1d( + context, layer_size, 1, activation=None, name="cont_conv") outputs += cont_tfm if activation_fn is not None: diff --git a/tensor2tensor/models/common_layers_test.py b/tensor2tensor/models/common_layers_test.py index 3a2fafd8b..8e724587b 100644 --- a/tensor2tensor/models/common_layers_test.py +++ b/tensor2tensor/models/common_layers_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py new file mode 100644 index 000000000..bdb93509b --- /dev/null +++ b/tensor2tensor/models/gene_expression.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Models for gene expression from DNA.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class GeneExpressionConv(t2t_model.T2TModel): + """Gene expression conv net. + + Based on "Basenji" model from + http://www.biorxiv.org/content/early/2017/07/10/161851 + + Uses layer_norm instead of batch_norm. 
+ """ + + def model_fn_body(self, features): + inputs = features["inputs"] + inputs.get_shape().assert_has_rank(4) + + hp = self._hparams + + out = inputs + out = common_layers.flatten4d3d(out) + + # Conv layers + for i in xrange(hp.num_conv_layers): + out = conv_layer( + out, + hp.hidden_size, + hp.kernel_width, + hp.stride, + hp.pooling_windows[i], + hp.dropout, + 1, + name="conv_%d" % (i + 1)) + + # Dense dilated conv layers + for i in xrange(hp.num_dconv_layers): + dilation_rate = 2**(i + 1) + dconv_out = conv_layer( + out, + hp.hidden_size, + hp.kernel_width, + 1, + 0, + hp.dropout, + dilation_rate, + name="dconv_%d" % (i + 1)) + out = tf.concat([out, dconv_out], axis=2) + + # Fully connected layer + out = fc_layer(out, hp.hidden_size, hp.dropout, name="fc") + + out.get_shape().assert_has_rank(3) + out = tf.expand_dims(out, 2) + return out + + +def conv_layer(x, + hidden_size, + kernel_size, + stride, + pooling_window, + dropout_rate, + dilation_rate, + name="conv"): + with tf.variable_scope(name): + out = x + out = common_layers.conv1d_block( + out, + hidden_size, [(dilation_rate, kernel_size)], + strides=stride, + first_relu=False, + padding="same") + out = tf.nn.relu(out) + if pooling_window: + out = tf.layers.max_pooling1d( + out, pooling_window, pooling_window, padding="same") + out = tf.layers.dropout(out, dropout_rate) + return out + + +def fc_layer(x, num_out, dropout_rate, name="fc"): + with tf.variable_scope(name): + out = x + out = tf.layers.dense(out, num_out) + out = tf.contrib.layers.layer_norm(out) + out = tf.nn.relu(out) + out = tf.layers.dropout(out, dropout_rate) + return out + + +@registry.register_hparams +def gene_expression_conv_base(): + """Hparams for GeneExpressionConv model.""" + hparams = common_hparams.basic_params1() + hparams.add_hparam("num_conv_layers", 4) + hparams.add_hparam("num_dconv_layers", 7) + hparams.add_hparam("pooling_windows", [2, 4, 4, 4]) + + # TODO(rsepassi): Correct the values of these hyperparameters + 
hparams.hidden_size = 128 + hparams.kernel_width = 128 + hparams.add_hparam("stride", 1) + return hparams diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py new file mode 100644 index 000000000..bec5268fd --- /dev/null +++ b/tensor2tensor/models/gene_expression_test.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Gene Expression models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import gene_expression as gene_data +from tensor2tensor.models import gene_expression +from tensor2tensor.models import modalities # pylint: disable=unused-import + +import tensorflow as tf + + +def gene_expression_conv_test(): + hparams = gene_expression.gene_expression_conv_base() + hparams.hidden_size = 8 + hparams.num_dconv_layers = 2 + return hparams + + +class GeneExpressionModelsTest(tf.test.TestCase): + + def _testModel(self, hparams, model_cls): + batch_size = 3 + target_length = 6 + target_out = 10 # GeneExpressionProblem.num_output_predictions + input_length = target_length * 128 + input_vocab_size = 5 + + inputs = np.random.random_integers( + input_vocab_size, size=(batch_size, input_length, 1, 1)) + targets = np.random.random_sample((batch_size, target_length, 1, + target_out)) + 
+ features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.float32), + } + p_hparams, = hparams.problems + sharded_logits, _, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams).model_fn(features) + logits = tf.concat(sharded_logits, 0) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + res = sess.run(logits) + + self.assertEqual(res.shape, (batch_size, target_length, 1, target_out)) + + def testGeneExpressionModels(self): + models_hparams = [(gene_expression.GeneExpressionConv, + gene_expression_conv_test())] + for model_cls, hparams in models_hparams: + hparams.add_hparam("data_dir", None) + p_hparams = gene_data.GeneExpressionCAGE10().internal_hparams(hparams) + hparams.problems = [p_hparams] + self._testModel(hparams, model_cls) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/long_answer.py b/tensor2tensor/models/long_answer.py index 7bb6a4a55..be8024f63 100644 --- a/tensor2tensor/models/long_answer.py +++ b/tensor2tensor/models/long_answer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index c3ae0a01e..ae221bdff 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 4ddaf6b64..1e542a666 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 60df80a1c..20464c0a2 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -165,7 +166,7 @@ def targets_bottom(self, inputs): def top(self, body_output, _): with tf.variable_scope("rgb_softmax"): - # seperate embedding for each channel + # separate embedding for each channel # assuming the body output returns a tensor of shape # [batch_size, rows, cols, channels, self._body_input_depth] body_output_split = tf.split(body_output, self._channels, axis=3) @@ -180,12 +181,11 @@ def top(self, body_output, _): shape = tf.shape(body_output_split[i])[:-1] body_output = tf.reshape(body_output_split[i], [-1, self._body_input_depth]) - channel_logits = tf.matmul(body_output, - output_rgb_embedding_var[i], - transpose_b=True) - rgb_channel_logits.append(tf.reshape( - channel_logits, tf.concat([shape, [self.top_dimensionality]], - 0))) + channel_logits = tf.matmul( + body_output, output_rgb_embedding_var[i], transpose_b=True) + rgb_channel_logits.append( + tf.reshape(channel_logits, + tf.concat([shape, [self.top_dimensionality]], 0))) logits = tf.concat(rgb_channel_logits, axis=3) # Reshape logits to conform to CIFAR image shapes (32 by 32 by 3) @@ -467,6 +467,38 @@ def top(self, body_output, _): return body_output +@registry.register_generic_modality("real") +class RealModality(modality.Modality): + """Modality for real (i.e. 
float) vectors.""" + + def bottom(self, x): + with tf.variable_scope("real"): + return tf.layers.dense(x, self._body_input_depth) + + def top(self, body_output, _): + with tf.variable_scope("real"): + return tf.layers.dense(body_output, self._vocab_size) + + def top_sharded(self, + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=common_layers.weights_nonzero): + sharded_predictions = data_parallelism(self.top, sharded_body_output, + sharded_targets) + + def l2_loss(predictions, targets): + with tf.name_scope("l2"): + weights = weights_fn(targets) + l2 = tf.pow(predictions - targets, 2) + return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) + + loss_num, loss_den = data_parallelism(l2_loss, sharded_predictions, + sharded_targets) + loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) + return sharded_predictions, loss + + @registry.register_image_modality("identity_no_pad") class IdentityModalityNoPad(modality.Modality): """Does nothing except making sure that there is no padding in cross-ent.""" diff --git a/tensor2tensor/models/modalities_test.py b/tensor2tensor/models/modalities_test.py index 118db3847..4254c6b04 100644 --- a/tensor2tensor/models/modalities_test.py +++ b/tensor2tensor/models/modalities_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index 2cf639426..907a801cf 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,6 +27,7 @@ from tensor2tensor.models import attention_lm_moe from tensor2tensor.models import bluenet from tensor2tensor.models import bytenet +from tensor2tensor.models import gene_expression from tensor2tensor.models import long_answer from tensor2tensor.models import lstm from tensor2tensor.models import modalities diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index bf06dfd65..089889ce6 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index 958fac5d7..03990594b 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index 30d535098..fc9d75639 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 1dddc1056..3d1cc0562 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py index 26d43afb3..7fa40783a 100644 --- a/tensor2tensor/models/shake_shake.py +++ b/tensor2tensor/models/shake_shake.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 2ad4c89d1..69e2338b6 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index 911953445..692799571 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c693d1ca3..c45e88577 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -45,8 +46,8 @@ def model_fn_body(self, features): # Remove dropout if not training hparams = copy.copy(self._hparams) targets = features["targets"] - inputs = features.get("inputs") - target_space = features.get("target_space_id") + inputs = features["inputs"] + target_space = features["target_space_id"] inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) diff --git a/tensor2tensor/models/transformer_alternative.py b/tensor2tensor/models/transformer_alternative.py index 280dbc713..62413c325 100644 --- a/tensor2tensor/models/transformer_alternative.py +++ b/tensor2tensor/models/transformer_alternative.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 997b5d172..a7f1fc9ae 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index d3c5a2690..61fa61235 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index aa5c1c034..bf434aeac 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/__init__.py b/tensor2tensor/utils/__init__.py index eff6a2b14..3f714ce1f 100644 --- a/tensor2tensor/utils/__init__.py +++ b/tensor2tensor/utils/__init__.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/avg_checkpoints.py b/tensor2tensor/utils/avg_checkpoints.py index a84750310..77acd4353 100644 --- a/tensor2tensor/utils/avg_checkpoints.py +++ b/tensor2tensor/utils/avg_checkpoints.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index 3a511907d..dd8275204 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/beam_search_test.py b/tensor2tensor/utils/beam_search_test.py index e084f1f0e..5223989ea 100644 --- a/tensor2tensor/utils/beam_search_test.py +++ b/tensor2tensor/utils/beam_search_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index 155b10c72..20a7c8426 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -91,7 +92,6 @@ def compute_bleu(reference_corpus, matches_by_order[len(ngram) - 1] += overlap[ngram] for ngram in translation_ngram_counts: possible_matches_by_order[len(ngram)-1] += translation_ngram_counts[ngram] - precisions = [0] * max_order for i in xrange(0, max_order): if possible_matches_by_order[i] > 0: @@ -106,7 +106,6 @@ def compute_bleu(reference_corpus, if use_bp: ratio = translation_length / reference_length bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0 - bleu = geo_mean * bp return np.float32(bleu) @@ -127,8 +126,8 @@ def bleu_score(predictions, labels, **unused_kwargs): """ outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) # Convert the outputs and labels to a [batch_size, input_length] tensor. - outputs = tf.squeeze(outputs) - labels = tf.squeeze(labels) + outputs = tf.squeeze(outputs, axis=[-1, -2]) + labels = tf.squeeze(labels, axis=[-1, -2]) bleu = tf.py_func(compute_bleu, (labels, outputs), tf.float32) return bleu, tf.constant(1.0) diff --git a/tensor2tensor/utils/bleu_hook_test.py b/tensor2tensor/utils/bleu_hook_test.py index 8092ab979..bf08174f8 100644 --- a/tensor2tensor/utils/bleu_hook_test.py +++ b/tensor2tensor/utils/bleu_hook_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index cd8e6c2d3..24dd31485 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -137,10 +138,12 @@ def preprocessing(examples, data_file_pattern, mode): # Small single-example pre-processing for images. 
def resize(img, size): return tf.to_int64(tf.image.resize_images(img, [size, size])) + def preprocess(img): img = tf.image.resize_images(img, [360, 360]) img = common_layers.image_augmentation(tf.to_float(img) / 255.) return tf.to_int64(img * 255.) + if ("image_imagenet" in data_file_pattern or "image_mscoco" in data_file_pattern): examples["inputs"] = tf.cast(examples["inputs"], tf.int64) @@ -153,8 +156,8 @@ def preprocess(img): lambda img=inputs: resize(img, 299)) else: examples["inputs"] = tf.to_int64(resize(inputs, 299)) - elif ("image_cifar10" in data_file_pattern - and mode == tf.contrib.learn.ModeKeys.TRAIN): + elif ("image_cifar10" in data_file_pattern and + mode == tf.contrib.learn.ModeKeys.TRAIN): examples["inputs"] = common_layers.cifar_image_augmentation( examples["inputs"]) elif "img2img" in data_file_pattern: @@ -181,8 +184,62 @@ def preprocess(img): return examples -def input_pipeline(data_file_pattern, capacity, mode): +def problem_input_pipeline(problem, data_file_pattern, capacity, mode): + """Input pipeline for Problems.""" + data_fields, data_items_to_decoders = problem.example_reading_spec() + + # Create placeholders for input, rather than reading data from disk. + if data_file_pattern is None: + return feature_placeholders(data_fields) + + # Now the non-trivial case construction. + examples = examples_queue( + [data_file_pattern], + data_fields, + training=(mode == tf.contrib.learn.ModeKeys.TRAIN), + capacity=capacity, + data_items_to_decoders=data_items_to_decoders) + + examples = problem.preprocess_examples(examples, mode) + + # We do not want int64s as they are not supported on GPUs. 
+ examples = cast_int64_to_int32(examples) + + return examples + + +def cast_int64_to_int32(features): + f = {} + for k, v in six.iteritems(features): + if v.dtype == tf.int64: + v = tf.to_int32(v) + f[k] = v + return f + + +def feature_placeholders(data_fields): + feature_map = {} + for (field, tp) in data_fields: + if not field.startswith("targets"): + feature_map[field] = tf.placeholder( + dtype=tp, shape=[None] * 4, name=field) + return feature_map + + +def input_pipeline(problem, data_file_pattern, capacity, mode): """Input pipeline, returns a dictionary of tensors from queues.""" + + if problem is not None: + # problem is not None when the problem is specified with the Problem API, + # which handles Example decoding and preprocessing. + # Otherwise the problem is specified in problem_hparams and is dealt with + # below. + # As problems are ported to the Problem API, the special handling here will + # need to be moved to Problem.example_reading_spec and + # Problem.preprocessing. + return problem_input_pipeline(problem, data_file_pattern, capacity, mode) + + data_items_to_decoders = None # Read from image TFRecords if the file has "image" in its name. if data_file_pattern and "image" in data_file_pattern: label_key = "image/class/label" @@ -210,22 +267,15 @@ def input_pipeline(data_file_pattern, capacity, mode): "audio/sample_width": tf.FixedLenFeature((), tf.int64), "targets": tf.VarLenFeature(tf.int64), } - data_items_to_decoders = None else: data_fields = { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) } - data_items_to_decoders = None # Create placeholders for input, rather than reading data from disk. if data_file_pattern is None: - feature_map = {} - for (field, tp) in data_fields: - if field != "targets": - feature_map[field] = tf.placeholder( - dtype=tp, shape=[None] * 4, name=field) - return feature_map + return feature_placeholders(data_fields) # Now the non-trivial case construction. 
examples = examples_queue( @@ -237,8 +287,9 @@ def input_pipeline(data_file_pattern, capacity, mode): examples = preprocessing(examples, data_file_pattern, mode) - # We do not want int64s as they do are not supported on GPUs. - return {k: tf.to_int32(v) for (k, v) in six.iteritems(examples)} + # We do not want int64s as they are not supported on GPUs. + examples = cast_int64_to_int32(examples) + return examples def batch_examples(examples, batching_scheme): diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index 18507ed06..f0c318e7b 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index c3becbfb4..e21f2453a 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index cf66f6af8..ae9ce3882 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,8 +18,6 @@ from __future__ import division from __future__ import print_function -import functools - # Dependency imports import six @@ -28,7 +27,24 @@ import tensorflow as tf -FLAGS = tf.flags.FLAGS + +class Metrics(object): + """Available evaluation metrics.""" + # Entries here should match the keys in METRICS_FN below + ACC = "accuracy" + ACC_TOP5 = "accuracy_top5" + ACC_PER_SEQ = "accuracy_per_sequence" + NEG_LOG_PERPLEXITY = "neg_log_perplexity" + APPROX_BLEU = "approx_bleu_score" + RMSE = "rmse" + + +def padded_rmse(predictions, labels, weights_fn=common_layers.weights_nonzero): + predictions, labels = common_layers.pad_with_zeros(predictions, labels) + targets = labels + weights = weights_fn(targets) + error = tf.sqrt(tf.pow(predictions - labels, 2)) + return tf.reduce_sum(error * weights), tf.reduce_sum(weights) def padded_accuracy_topk(predictions, @@ -97,62 +113,76 @@ def create_evaluation_metrics(problems): """Creates the evaluation metrics for the model. Args: - problems: List of strings containing the name of the problems. + problems: List of tuples (problem name, problem instance). Returns: A dictionary with keys that are strings naming the evaluation metrics and values that are functions taking arguments of (predictions, targets), returning a tuple of a tensor of the metric's value together with an op to update the metric's value. + + Raises: + ValueError: if the metrics specified by a problem are not recognized (i.e. + are not defined in the Metrics enum. """ - def append_metric_fns(metric_tup, eval_metrics): - """Append problem-specific and global metrics to eval_metrics.""" - metric_name, metric_function = metric_tup - def fn(predictions, labels, weights, idx, weights_fn): - # The 'weights' argument represents problem-choice here, - # we need to keep this name because MetricSpecs checks it. 
+ def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): + """Create a metric fn conditioned on problem_idx.""" + + def problem_metric_fn(predictions, labels, weights): problem_choice = weights (scores, weights) = tf.cond( - tf.equal(idx, problem_choice), # pylint: disable=cell-var-from-loop - lambda: metric_function(predictions, labels, weights_fn=weights_fn), + tf.equal(problem_idx, problem_choice), + lambda: metric_fn(predictions, labels, weights_fn=weights_fn), lambda: (tf.constant(0.0), tf.constant(0.0))) # The tf.metrics.mean function assures correct aggregation. return tf.metrics.mean(scores, weights) - for i, problem in enumerate(problems): - name = "metrics-%s/%s" % (problem, metric_name) - class_output = "image" in problem and "coco" not in problem - weights_fn = (common_layers.weights_all if class_output - else common_layers.weights_nonzero) - eval_metrics[name] = functools.partial(fn, idx=i, weights_fn=weights_fn) - - def global_fn(predictions, labels, weights): - (scores, weights) = metric_function(predictions, labels) - return tf.metrics.mean(scores, weights) - - eval_metrics["metrics/%s" % metric_name] = global_fn + return problem_metric_fn eval_metrics = dict() - - # Metrics are functions that take predictions and labels and return - # a tensor of metrics and a tensor of weights. - # The results are passed to tf.metrics.mean to accumulate properly. - metrics_list = [("accuracy", padded_accuracy), ("accuracy_top5", - padded_accuracy_top5), - ("accuracy_per_sequence", padded_sequence_accuracy), - ("neg_log_perplexity", padded_neg_log_perplexity)] - - # TODO(nikip): Extend this to support use of custom metrics for problems. 
- for problem in problems: - if "wmt" in problem: - metrics_list.append(("approx_bleu_score", bleu_hook.bleu_score)) - - for metric in metrics_list: - append_metric_fns(metric, eval_metrics) + for problem_idx, (problem_name, problem_instance) in enumerate(problems): + if problem_instance is None: + # For problems in problem_hparams + metrics = [ + Metrics.ACC, Metrics.ACC_TOP5, Metrics.ACC_PER_SEQ, + Metrics.NEG_LOG_PERPLEXITY + ] + if "wmt" in problem_name: + metrics.append(Metrics.APPROX_BLEU) + else: + # For registered Problems + metrics = problem_instance.eval_metrics() + if not all([m in METRICS_FNS for m in metrics]): + raise ValueError("Unrecognized metric. Problem %s specified metrics " + "%s. Recognized metrics are %s." % + (problem_name, metrics, METRICS_FNS.keys())) + + class_output = "image" in problem_name and "coco" not in problem_name + weights_fn = (common_layers.weights_all + if class_output else common_layers.weights_nonzero) + + for metric in metrics: + metric_fn = METRICS_FNS[metric] + problem_metric_fn = make_problem_specific_metric_fn( + metric_fn, problem_idx, weights_fn) + eval_metrics["metrics-%s/%s" % (problem_name, metric)] = problem_metric_fn return { k: tf.contrib.learn.MetricSpec( v, prediction_key="predictions", weight_key="problem_choice") for (k, v) in six.iteritems(eval_metrics) } + + +# Metrics are functions that take predictions and labels and return +# a tensor of metrics and a tensor of weights. +# The results are passed to tf.metrics.mean to accumulate properly. 
+METRICS_FNS = { + Metrics.ACC: padded_accuracy, + Metrics.ACC_TOP5: padded_accuracy_top5, + Metrics.ACC_PER_SEQ: padded_sequence_accuracy, + Metrics.NEG_LOG_PERPLEXITY: padded_neg_log_perplexity, + Metrics.APPROX_BLEU: bleu_hook.bleu_score, + Metrics.RMSE: padded_rmse, +} diff --git a/tensor2tensor/utils/metrics_test.py b/tensor2tensor/utils/metrics_test.py index de72d797f..0d78e632c 100644 --- a/tensor2tensor/utils/metrics_test.py +++ b/tensor2tensor/utils/metrics_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index 3ac6153b7..72169be1f 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -42,7 +43,7 @@ class Modality(object): function targets_bottom represents the auto-regressive part of the network. It is applied to the already-generated part of an image, which is given to the decoder to generate the next part. In some cases, e.g., for text, it is - the same as the inputs_bottom function, as that is the default we use. But, + the same as the inputs_bottom function, and that is the default we use. But, e.g., for images, a different function might be needed to regress properly. All 3 functions have simple and sharded versions. A sub-class only needs diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 5a8823510..0baad2471 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py index 1f4436b0c..3231809ea 100644 --- a/tensor2tensor/utils/registry_test.py +++ b/tensor2tensor/utils/registry_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 2a271afbf..66e40d495 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -163,6 +164,8 @@ def infer(self, Returns: samples: an integer `Tensor`. """ + # TODO(rsepassi): Make decoding work with real-valued model outputs + # (i.e. if the target modality is RealModality). if not self.has_input: # since there is no input, it is more interesting to see randomly # generated sequences, than to see the most likely sequence repeatedly. @@ -499,5 +502,5 @@ def _warn_changed_modality_type(new_name, old_name, feature_name): old_type, old_name = registry.parse_modality_name(old_name) if new_type != old_type: tf.logging.warning("%s has a designated modality type %s (%s) but has been " - "overriden with a modality of type %s (%s).", + "overridden with a modality of type %s (%s).", feature_name, old_type, old_name, new_type, new_name) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 878dbe107..bf105c5ae 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -44,6 +45,7 @@ import tensorflow as tf from tensorflow.contrib.learn.python.learn import learn_runner +from tensorflow.python import debug from tensorflow.python.ops import init_ops # Number of samples to draw for an image input (in such cases as captioning) @@ -54,6 +56,8 @@ flags.DEFINE_bool("registry_help", False, "If True, logs the contents of the registry and exits.") +flags.DEFINE_bool("tfdbg", False, + "If True, use the TF debugger CLI on train/eval.") flags.DEFINE_string("output_dir", "", "Base output directory for run.") flags.DEFINE_string("model", "", "Which model to use.") flags.DEFINE_string("hparams_set", "", "Which parameters to use.") @@ -84,7 +88,7 @@ flags.DEFINE_integer("local_eval_frequency", 2000, "Run evaluation every this steps during local training.") flags.DEFINE_bool("locally_shard_to_cpu", False, - "Use CPU as a sharding device runnning locally. This allows " + "Use CPU as a sharding device running locally. This allows " "to test sharded model construction on a machine with 1 GPU.") flags.DEFINE_bool("daisy_chain_variables", True, "copy variables around in a daisy chain") @@ -102,6 +106,9 @@ flags.DEFINE_integer("ps_replicas", 0, "How many ps replicas.") # Decode flags +# Set one of {decode_from_dataset, decode_interactive, decode_from_file} to +# decode. +flags.DEFINE_bool("decode_from_dataset", False, "Decode from dataset on disk.") flags.DEFINE_bool("decode_use_last_position_only", False, "In inference, use last position only for speedup.") flags.DEFINE_bool("decode_interactive", False, @@ -121,6 +128,8 @@ "Whether to return 1 (False) or all (True) beams. The \n " "output file will have the format " "\t..\t") +flags.DEFINE_integer("decode_max_input_size", -1, + "Maximum number of ids in input. 
Or <= 0 for no max.") def _save_until_eos(hyp): @@ -149,21 +158,35 @@ def experiment_fn(output_dir): def create_experiment(output_dir, data_dir, model_name, train_steps, eval_steps): + """Create Experiment.""" hparams = create_hparams(FLAGS.hparams_set, data_dir) estimator, input_fns = create_experiment_components( hparams=hparams, output_dir=output_dir, data_dir=data_dir, model_name=model_name) + eval_metrics = metrics.create_evaluation_metrics( + zip(FLAGS.problems.split("-"), hparams.problem_instances)) + if (hasattr(FLAGS, "autotune") and FLAGS.autotune and + FLAGS.objective not in eval_metrics): + raise ValueError("Tuning objective %s not among evaluation metrics %s" % + (FLAGS.objective, eval_metrics.keys())) + train_monitors = [] + eval_hooks = [] + if FLAGS.tfdbg: + hook = debug.LocalCLIDebugHook() + train_monitors.append(hook) + eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, train_input_fn=input_fns["train"], eval_input_fn=input_fns["eval"], - eval_metrics=metrics.create_evaluation_metrics(FLAGS.problems.split("-")), + eval_metrics=eval_metrics, train_steps=train_steps, eval_steps=eval_steps, min_eval_frequency=FLAGS.local_eval_frequency, - train_monitors=[]) + train_monitors=train_monitors, + eval_hooks=eval_hooks) def create_experiment_components(hparams, output_dir, data_dir, model_name): @@ -226,13 +249,16 @@ def create_hparams(params_id, data_dir): # Add hparams for the problems hparams.problems = [] + hparams.problem_instances = [] for problem_name in FLAGS.problems.split("-"): try: problem = registry.problem(problem_name) p_hparams = problem.internal_hparams(hparams) except ValueError: + problem = None p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + hparams.problem_instances.append(problem) hparams.problems.append(p_hparams) return hparams @@ -301,9 +327,10 @@ def session_config(): gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction) - config = 
tf.ConfigProto(allow_soft_placement=True, - graph_options=graph_options, - gpu_options=gpu_options) + config = tf.ConfigProto( + allow_soft_placement=True, + graph_options=graph_options, + gpu_options=gpu_options) return config @@ -419,8 +446,12 @@ def model_fn(features, targets, mode): def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( - hparams, mode, hparams.problems[n], - n, dp, _ps_devices(all_workers=True)) + hparams, + mode, + hparams.problems[n], + n, + dp, + _ps_devices(all_workers=True)) if mode == tf.contrib.learn.ModeKeys.INFER: return model_class.infer( features, @@ -482,8 +513,8 @@ def nth_model(n): if mode == tf.contrib.learn.ModeKeys.EVAL: logits = tf.concat(sharded_logits, 0) if FLAGS.eval_print: - logits = tf.Print(logits, [features["inputs"], logits], - "EVAL PRINT", summarize=10000) + logits = tf.Print( + logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) # For evaluation, return the logits layer as our predictions. run_info["predictions"] = logits train_op = None @@ -541,19 +572,24 @@ def nth_model(n): # Define the train_op for the TRAIN mode. opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams) tf.logging.info("Computing gradients for global model_fn.") + opt_summaries = ["learning_rate", "loss"] + if hparams.summarize_grads: + opt_summaries.extend(["gradients", "gradient_norm"]) train_op = tf.contrib.layers.optimize_loss( name="training", loss=total_loss, global_step=tf.contrib.framework.get_global_step(), learning_rate=learning_rate, clip_gradients=hparams.clip_grad_norm or None, + gradient_noise_scale=hparams.grad_noise_scale or None, optimizer=opt, + summaries=opt_summaries, colocate_gradients_with_ops=True) # Remove summaries that will fail to run because they are in conditionals. # TODO(cwhipkey): Test with this code removed, later in 2017. 
summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) - for i in range(len(summaries)-1, -1, -1): + for i in range(len(summaries) - 1, -1, -1): if summaries[i].name.startswith("cond_"): del summaries[i] @@ -569,18 +605,18 @@ def run_locally(exp): Args: exp: Experiment. """ - if exp.train_steps > 0: - # Train - tf.logging.info("Performing local training.") + if exp.train_steps > 0 or exp.eval_steps > 0: + tf.logging.info("Performing local training and evaluation.") exp.train_and_evaluate() + decode(exp.estimator) - # Predict - estimator = exp.estimator + +def decode(estimator): if FLAGS.decode_interactive: decode_interactively(estimator) elif FLAGS.decode_from_file is not None: decode_from_file(estimator, FLAGS.decode_from_file) - else: + elif FLAGS.decode_from_dataset: decode_from_dataset(estimator) @@ -599,8 +635,7 @@ def decode_from_dataset(estimator): data_file_patterns=infer_problems_data, num_datashards=data_parallelism().n, fixed_problem=i) - result_iter = estimator.predict( - input_fn=infer_input_fn, as_iterable=False) + result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) def log_fn(inputs, targets, @@ -693,10 +728,14 @@ def log_fn(inputs, outputs): decodes.reverse() # Dumping inputs and outputs to file filename.decodes in # format result\tinput in the same order as original inputs + if FLAGS.decode_to_file: + output_filename = FLAGS.decode_to_file + else: + output_filename = filename if FLAGS.decode_shards > 1: - base_filename = filename + ("%.2d" % FLAGS.worker_id) + base_filename = output_filename + ("%.2d" % FLAGS.worker_id) else: - base_filename = filename + base_filename = output_filename decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + str(FLAGS.decode_alpha) + ".decodes") @@ -727,8 +766,8 @@ def decode_interactively(estimator): else: tf.logging.info(beam_string) else: - tf.logging.info(targets_vocab.decode(_save_until_eos( - result["outputs"].flatten()))) + tf.logging.info( + targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, @@ -741,9 +780,12 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, tf.logging.info("Decoding batch %d" % b) batch_length = 0 batch_inputs = [] - for inputs in sorted_inputs[b * FLAGS.decode_batch_size: - (b + 1) * FLAGS.decode_batch_size]: + for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( + b + 1) * FLAGS.decode_batch_size]: input_ids = vocabulary.encode(inputs) + if FLAGS.decode_max_input_size > 0: + # Subtract 1 for the EOS_ID. + input_ids = input_ids[:FLAGS.decode_max_input_size - 1] input_ids.append(text_encoder.EOS_ID) batch_inputs.append(input_ids) if len(input_ids) > batch_length: @@ -1037,12 +1079,13 @@ def input_fn(): for n in xrange(problem_count): if fixed_problem is not None and n != fixed_problem: continue + problem_instance = hparams.problem_instances[n] with tf.name_scope("problem_%d" % n): with tf.device("/cpu:0"): # Input queues are on CPU. capacity = hparams.problems[n].max_expected_batch_size_per_shard capacity *= num_datashards - examples = data_reader.input_pipeline(data_file_patterns[n], - capacity, mode) + examples = data_reader.input_pipeline( + problem_instance, data_file_patterns[n], capacity, mode) if mode == tf.contrib.learn.ModeKeys.TRAIN: drop_long_sequences = True else: @@ -1057,15 +1100,18 @@ def input_fn(): length_multiplier=batch_size_multiplier)) # Reverse inputs and targets features if the problem was reversed. 
- if hparams.problems[n].was_reversed: - inputs = feature_map["inputs"] - targets = feature_map["targets"] - feature_map["inputs"] = targets - feature_map["targets"] = inputs - - # Use the inputs as the targets if the problem is a copy problem. - if hparams.problems[n].was_copy: - feature_map["targets"] = feature_map["inputs"] + if problem_instance is not None: + problem_instance.maybe_reverse_features(feature_map) + problem_instance.maybe_copy_features(feature_map) + else: + if hparams.problems[n].was_reversed: + inputs = feature_map["inputs"] + targets = feature_map["targets"] + feature_map["inputs"] = targets + feature_map["targets"] = inputs + # Use the inputs as the targets if the problem is a copy problem. + if hparams.problems[n].was_copy: + feature_map["targets"] = feature_map["inputs"] # Ensure inputs and targets are proper rank. while len(feature_map["inputs"].get_shape()) != 4: @@ -1106,9 +1152,9 @@ def input_fn(): assert FLAGS.worker_replicas % problem_count == 0 problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) else: - raise ValueError("Value of hparams.problem_choice is %s and must be " - "one of [uniform, adaptive, distributed]" % - hparams.problem_choice) + raise ValueError( + "Value of hparams.problem_choice is %s and must be " + "one of [uniform, adaptive, distributed]" % hparams.problem_choice) # Inputs and targets conditional on problem_choice. rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index( diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index 3ed86952b..ea88183c9 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/usr_dir.py b/tensor2tensor/utils/usr_dir.py index 0a2d0d15c..d89745b98 100644 --- a/tensor2tensor/utils/usr_dir.py +++ b/tensor2tensor/utils/usr_dir.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/yellowfin.py b/tensor2tensor/utils/yellowfin.py index 6bbe31bf6..aeb14e76e 100644 --- a/tensor2tensor/utils/yellowfin.py +++ b/tensor2tensor/utils/yellowfin.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensor2tensor/utils/yellowfin_test.py b/tensor2tensor/utils/yellowfin_test.py index c4727175b..2130be2b3 100644 --- a/tensor2tensor/utils/yellowfin_test.py +++ b/tensor2tensor/utils/yellowfin_test.py @@ -1,3 +1,4 @@ +# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License");