From 124b6dc96d31972133a2660b73b1791dd057c9b1 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 16 Jan 2025 11:00:25 +0000 Subject: [PATCH 1/3] Write Detailed Version To XML Write a version with commit hash instead of pip version to openvino_tokenizer.xml --- python/openvino_tokenizers/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py index ae4edba5..f5890726 100644 --- a/python/openvino_tokenizers/utils.py +++ b/python/openvino_tokenizers/utils.py @@ -8,10 +8,12 @@ from functools import lru_cache from typing import Any, Dict, Optional, Sequence, Tuple, Union +import openvino from openvino import Model, Type from openvino.preprocess import PrePostProcessor from openvino.runtime import opset12 as opset +from __version__ import __version__ as openvino_tokenizers_version from .constants import ( LOGITS_OUTPUT_NAME, ORIGINAL_TOKENIZER_CLASS_NAME, @@ -244,7 +246,11 @@ def update_rt_info_with_environment(ov_tokenizer: Model) -> None: :param ov_tokenizer: Thes OpenVINO tokenizer model to update. 
:type ov_tokenizer: openvino.Model """ - packages = ["openvino_tokenizers", "transformers", "tiktoken", "sentencepiece", "openvino", "tokenizers"] + ov_tokenizer.set_rt_info(openvino.get_version(), f"openvino_version") + ov_tokenizer.set_rt_info(openvino_tokenizers_version, f"openvino_tokenizers_version") + + packages = ["transformers", "tiktoken", "sentencepiece", "tokenizers"] + for name in packages: version = get_package_version(name) if version is not None: From 4a2624a5b19e915be120adadf0cd5756b024811e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 16 Jan 2025 11:31:13 +0000 Subject: [PATCH 2/3] Write Detailed Version To XML Write a version with commit hash instead of pip version to openvino_tokenizer.xml --- python/openvino_tokenizers/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py index f5890726..dbad1132 100644 --- a/python/openvino_tokenizers/utils.py +++ b/python/openvino_tokenizers/utils.py @@ -13,7 +13,7 @@ from openvino.preprocess import PrePostProcessor from openvino.runtime import opset12 as opset -from __version__ import __version__ as openvino_tokenizers_version +from .__version__ import __version__ as openvino_tokenizers_version from .constants import ( LOGITS_OUTPUT_NAME, ORIGINAL_TOKENIZER_CLASS_NAME, From 2194abd81b4fc0c73b2f1ffdf9170832e57c013e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 16 Jan 2025 12:53:23 +0000 Subject: [PATCH 3/3] Fix Import, Lint, Formatting --- python/openvino_tokenizers/__init__.py | 1 + python/openvino_tokenizers/hf_parser.py | 6 +++++- .../openvino_tokenizers/tokenizer_pipeline.py | 19 +++++++++++++---- python/openvino_tokenizers/utils.py | 21 +++++++++---------- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/python/openvino_tokenizers/__init__.py b/python/openvino_tokenizers/__init__.py index 5a9ee99f..a83b9e64 100644 --- a/python/openvino_tokenizers/__init__.py +++ 
b/python/openvino_tokenizers/__init__.py @@ -82,6 +82,7 @@ def inner(opset_version: Optional[str] = None) -> NodeFactory: return inner + def _get_opset_factory_callable() -> Callable[[], NodeFactory]: # factory without extensions factory = {} diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py index c3428034..cde59d4c 100644 --- a/python/openvino_tokenizers/hf_parser.py +++ b/python/openvino_tokenizers/hf_parser.py @@ -391,7 +391,11 @@ def add_padding(self, use_max_padding: bool = False) -> None: def decoding(self) -> None: skip_tokens = parse_special_tokens(self.original_tokenizer) - self.pipeline.add_steps(VocabDecoderStep.from_hf_json(self.tokenizer_json, self.pipeline.vocab, list(skip_tokens), do_skip_tokens=self.skip_special_tokens)) + self.pipeline.add_steps( + VocabDecoderStep.from_hf_json( + self.tokenizer_json, self.pipeline.vocab, list(skip_tokens), do_skip_tokens=self.skip_special_tokens + ) + ) has_decoder = self.tokenizer_json.get("decoder") is not None if has_decoder and self.tokenizer_json["decoder"]["type"] == "Sequence": diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index a93013dd..e2a760a0 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -14,13 +14,12 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np -from openvino.runtime import Model, Output, PartialShape, Shape, Type, op, Tensor +from openvino.runtime import Model, Output, PartialShape, Shape, Type, op from openvino.runtime import opset12 as opset from openvino.runtime.exceptions import OVTypeError, UserInputError from openvino.runtime.utils.types import as_node, make_constant_node from . 
import _get_factory, _get_opset_factory - from .constants import ( ATTENTION_MASK_INPUT_NAME, DETOKENIZER_NAME, @@ -32,7 +31,13 @@ VOCAB_SIZE_CACHE_PROPORTION, UTF8ReplaceMode, ) -from .utils import apply_unicode_to_bytes, generate_tokens_with_space_symbols, has_incompatible_re2_op, quote_meta, create_unpacked_string +from .utils import ( + apply_unicode_to_bytes, + create_unpacked_string, + generate_tokens_with_space_symbols, + has_incompatible_re2_op, + quote_meta, +) logger = logging.getLogger(__name__) @@ -1035,7 +1040,13 @@ def finalize(self) -> None: self.skip_tokens = pipeline.skip_tokens or [] @classmethod - def from_hf_json(cls, tokenizer_json: Dict[str, Any], pipeline_vocab: Optional[List[str]], skip_tokens: Optional[List[int]] = None, do_skip_tokens: bool = True) -> "VocabDecoderStep": + def from_hf_json( + cls, + tokenizer_json: Dict[str, Any], + pipeline_vocab: Optional[List[str]], + skip_tokens: Optional[List[int]] = None, + do_skip_tokens: bool = True, + ) -> "VocabDecoderStep": model_type = tokenizer_json["model"]["type"] if pipeline_vocab is not None and model_type == "WordLevel": diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py index a35c502e..1d300a83 100644 --- a/python/openvino_tokenizers/utils.py +++ b/python/openvino_tokenizers/utils.py @@ -6,18 +6,16 @@ import re from dataclasses import dataclass, field, fields from functools import lru_cache -from typing import Any, Dict, Optional, Sequence, Tuple, Union, Iterable, List -import numpy as np -from numpy.typing import NDArray from io import BytesIO +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union - +import numpy as np import openvino -from openvino import Model, Type +from openvino import Model, Tensor, Type from openvino.preprocess import PrePostProcessor -from openvino.runtime import opset12 as opset, Output -from openvino.op import Constant -from openvino import Tensor +from openvino.runtime import Output +from 
openvino.runtime import opset12 as opset +from openvino.runtime.op import Constant from .__version__ import __version__ as openvino_tokenizers_version from .constants import ( @@ -29,6 +27,7 @@ rt_info_to_hf_attribute_map, ) + @dataclass class TokenzierConversionParams: """ @@ -251,8 +250,8 @@ def update_rt_info_with_environment(ov_tokenizer: Model) -> None: :param ov_tokenizer: Thes OpenVINO tokenizer model to update. :type ov_tokenizer: openvino.Model """ - ov_tokenizer.set_rt_info(openvino.get_version(), f"openvino_version") - ov_tokenizer.set_rt_info(openvino_tokenizers_version, f"openvino_tokenizers_version") + ov_tokenizer.set_rt_info(openvino.get_version(), "openvino_version") + ov_tokenizer.set_rt_info(openvino_tokenizers_version, "openvino_tokenizers_version") packages = ["transformers", "tiktoken", "sentencepiece", "tokenizers"] @@ -333,5 +332,5 @@ def create_unpacked_string(strings: Iterable[str]) -> List[Output]: begins = np.frombuffer(begins.getvalue(), np.int32) ends = np.frombuffer(ends.getvalue(), np.int32) chars = np.frombuffer(chars.getvalue(), np.uint8) - + return [Constant(Tensor(x)).output(0) for x in [begins, ends, chars]]