From 8be756450de0f460d0f58f7962942fab9aaca318 Mon Sep 17 00:00:00 2001
From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com>
Date: Sun, 19 Jan 2025 00:24:34 +0800
Subject: [PATCH] Remove pad_with_end_token argument.

---
 .../src/models/clip/clip_preprocessor_test.py |  4 ++--
 keras_hub/src/models/clip/clip_tokenizer.py   | 22 +++++++++----------
 .../src/models/clip/clip_tokenizer_test.py    |  6 -----
 ...usion_3_text_to_image_preprocessor_test.py |  2 +-
 4 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/keras_hub/src/models/clip/clip_preprocessor_test.py b/keras_hub/src/models/clip/clip_preprocessor_test.py
index e7ab87a156..c14c1beaa6 100644
--- a/keras_hub/src/models/clip/clip_preprocessor_test.py
+++ b/keras_hub/src/models/clip/clip_preprocessor_test.py
@@ -25,7 +25,7 @@ def test_preprocessor_basics(self):
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
             expected_output={
-                "token_ids": [[5, 1, 2, 1, 3, 4, 0, 0]],
+                "token_ids": [[5, 1, 2, 1, 3, 4, 4, 4]],
                 "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]],
             },
         )
@@ -39,7 +39,7 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 2, 1, 3, 0, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[1, 2, 1, 3, 4, 4, 4, 4]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)
 
     def test_sequence_length_override(self):
diff --git a/keras_hub/src/models/clip/clip_tokenizer.py b/keras_hub/src/models/clip/clip_tokenizer.py
index 44e8832996..2914771d57 100644
--- a/keras_hub/src/models/clip/clip_tokenizer.py
+++ b/keras_hub/src/models/clip/clip_tokenizer.py
@@ -39,7 +39,6 @@ class CLIPTokenizer(BytePairTokenizer):
             it should be the file path to merge rules. The merge rule file
             should have one merge rule per line. Every merge rule contains
             merge entities separated by a space.
-        pad_with_end_token: bool. Whether to pad the output with `end_token`.
 
     Examples:
 
@@ -64,13 +63,17 @@ def __init__(
         self,
         vocabulary=None,
         merges=None,
-        pad_with_end_token=False,
         **kwargs,
     ):
         self._add_special_token("<|startoftext|>", "start_token")
         self._add_special_token("<|endoftext|>", "end_token")
-        self.pad_token_id = 0
-        self.pad_with_end_token = pad_with_end_token
+        self._add_special_token("<|endoftext|>", "pad_token")
+
+        # TODO: Remove this in the future.
+        # To maintain backward compatibility, silently drop the deprecated
+        # `pad_with_end_token` arg.
+        if "pad_with_end_token" in kwargs:
+            kwargs.pop("pad_with_end_token")
 
         super().__init__(
             vocabulary=vocabulary,
@@ -81,8 +84,6 @@ def __init__(
 
     def set_vocabulary_and_merges(self, vocabulary, merges):
         super().set_vocabulary_and_merges(vocabulary, merges)
-        if self.pad_with_end_token:
-            self.pad_token_id = self.end_token_id
 
     def _bpe_merge_and_update_cache(self, tokens):
         """Process unseen tokens and add to cache."""
@@ -161,7 +162,9 @@ def process_unseen_tokens():
         if self.sequence_length:
             output_shape = tokens.shape.as_list()
             output_shape[-1] = self.sequence_length
-            tokens = tokens.to_tensor(shape=output_shape)
+            tokens = tokens.to_tensor(
+                default_value=self.pad_token_id, shape=output_shape
+            )
 
         # Convert to a dense output if input in scalar
         if unbatched:
@@ -194,11 +197,6 @@ def detokenize(self, inputs):
 
     def get_config(self):
         config = super().get_config()
-        config.update(
-            {
-                "pad_with_end_token": self.pad_with_end_token,
-            }
-        )
         # In the constructor, we pass the list of special tokens to the
         # `unsplittable_tokens` arg of the superclass' constructor. Hence, we
         # delete it from the config here.
diff --git a/keras_hub/src/models/clip/clip_tokenizer_test.py b/keras_hub/src/models/clip/clip_tokenizer_test.py
index 5c1e213a74..4a3a21718c 100644
--- a/keras_hub/src/models/clip/clip_tokenizer_test.py
+++ b/keras_hub/src/models/clip/clip_tokenizer_test.py
@@ -25,12 +25,6 @@ def test_tokenizer_basics(self):
             expected_detokenize_output=["airplane", "airport"],
         )
 
-    def test_pad_with_end_token(self):
-        init_kwargs = self.init_kwargs.copy()
-        init_kwargs["pad_with_end_token"] = True
-        tokenizer = CLIPTokenizer(**init_kwargs)
-        self.assertEqual(tokenizer.pad_token_id, tokenizer.end_token_id)
-
     def test_errors_missing_special_tokens(self):
         with self.assertRaises(ValueError):
             CLIPTokenizer(vocabulary={"foo": 0, "bar": 1}, merges=["fo o"])
diff --git a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor_test.py b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor_test.py
index 46d69be381..e4d843bf57 100644
--- a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor_test.py
+++ b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor_test.py
@@ -57,4 +57,4 @@ def test_generate_preprocess(self):
         self.assertIn("clip_l", x)
         self.assertIn("clip_g", x)
         self.assertAllEqual(x["clip_l"][0], [4, 0, 1, 3, 3, 3, 3, 3])
-        self.assertAllEqual(x["clip_g"][0], [4, 0, 1, 3, 0, 0, 0, 0])
+        self.assertAllEqual(x["clip_g"][0], [4, 0, 1, 3, 3, 3, 3, 3])
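
The behavior change this patch makes can be seen in isolation: `CLIPTokenizer`
now registers "<|endoftext|>" as the pad token in addition to the end token,
and the ragged-to-dense conversion in `tokenize()` fills with that id instead
of 0. Below is a minimal sketch of the underlying `RaggedTensor.to_tensor()`
call, assuming the toy vocabulary used by the tests above, where 5 is the id
of "<|startoftext|>" and 4 is the id of "<|endoftext|>":

    import tensorflow as tf

    # One tokenized sequence: start token (5), content ids, end token (4).
    tokens = tf.ragged.constant([[5, 1, 2, 1, 3, 4]])

    # Pad to the fixed sequence length with the pad (= end-of-text) id
    # rather than the previous default of 0.
    pad_token_id = 4
    dense = tokens.to_tensor(default_value=pad_token_id, shape=[1, 8])
    print(dense.numpy())  # [[5 1 2 1 3 4 4 4]]

Note that `pad_with_end_token` is still accepted (and silently dropped) by the
constructor, so existing configs that pass it keep loading.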