From 1dd6b667a83a6c7e7c88f02cdbd595d1f9ac028c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 26 Nov 2024 15:01:54 +0000 Subject: [PATCH 01/30] Add unicode normalization layer tests --- tests/layer_tests.py | 74 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 17ed84f4..5ea8640e 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -2,10 +2,11 @@ import re import tempfile from pathlib import Path -from typing import Union +from typing import Union, NamedTuple import openvino as ov import pytest +import requests from openvino import Model, PartialShape, Type from openvino.runtime import op from openvino_tokenizers import _get_factory @@ -19,12 +20,52 @@ RegexSplitStep, TokenizerPipeline, UTF8ValidateStep, + NormalizeUnicode, ) from tests.utils import get_hf_tokenizer core = ov.Core() +UNICODE_TEST_FILE_URL = "https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt" + + +class NormalizationTestLine(NamedTuple): + source: str + nfc: str + nfd: str + nfkc: str + nfkd: str + comment: str + +def parse_normalization_test_line(line): + parts, comment = line.split("#", 1) + parts = [part.strip() for part in parts.split(";")] + + # Convert the hexadecimal Unicode code points to characters + def hex_to_char(hex_str): + return "".join(chr(int(code, 16)) for code in hex_str.split()) + + # Parse the components + source = hex_to_char(parts[0]) + nfc = hex_to_char(parts[1]) + nfd = hex_to_char(parts[2]) + nfkc = hex_to_char(parts[3]) + nfkd = hex_to_char(parts[4]) + + return NormalizationTestLine(source, nfc, nfd, nfkc, nfkd, comment) + + +@pytest.fixture(scope="session") +def unicode_normalization_test_data(request): + # check https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt for details + test_file = requests.get(UNICODE_TEST_FILE_URL).text + return [ + parse_normalization_test_line(line) + for line in test_file.split("\n") + if line and not line.startswith("#") and not line.startswith("@") + ] + ############################################ ########## Test Normalizer Step ############ @@ -115,6 +156,31 @@ def precompiled_charsmap_json(request, hf_charsmap_tokenizer): return tj["normalizer"]["normalizers"][0] +@pytest.mark.parametrize( + "test_parameters", + [ + ("NFC", 19875, 90), + ("NFD", 19851, 114), + ("NFKC", 19777, 188), + ("NFKD", 19753, 212), + ] +) +def test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): + normalization_type, positive_threshold, negative_threshold = test_parameters + nfc_normalizer_layer = NormalizeUnicode(normalization_type) + compiled_model = create_normalization_model(nfc_normalizer_layer) + negative = 0 + positive = 0 + for test_input in unicode_normalization_test_data: + res_ov = compiled_model([test_input.source])[0][0].encode() + expected = getattr(test_input, normalization_type.lower()).encode() + positive += res_ov == expected + negative += res_ov != expected + + assert positive == positive_threshold + assert negative == negative_threshold + + @pytest.mark.parametrize("test_string", charsmap_test_strings) def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) @@ -140,7 +206,7 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled RegexNormalizationStep( regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)", 
replace_term=r"\1", - ) + ), ), ("", "", RegexNormalizationStep.prepend_regex("▁")), ("\n", "▁\n", RegexNormalizationStep.prepend_regex("▁")), @@ -152,9 +218,9 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled RegexNormalizationStep( regex_search_pattern=r"(^)(.)", replace_term=r"▁\2", - ) + ), ), - ] + ], ) def test_regex_normalization(test_string, expected, layer): compiled_model = create_normalization_model(layer) From afd4a60e8dd4858c72913d684e2525d978a27f38 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 29 Nov 2024 10:47:56 +0000 Subject: [PATCH 02/30] WiP --- .../openvino_tokenizers/tokenizer_pipeline.py | 19 ++++++++- src/CMakeLists.txt | 4 +- src/charsmap_normalization.cpp | 42 +++++++++++++------ src/charsmap_normalization.hpp | 27 +++++++++++- tests/layer_tests.py | 19 ++++----- 5 files changed, 84 insertions(+), 27 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 734a04d5..76102d1a 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -155,7 +155,7 @@ class NormalizationStep(BasePipelineStep): @dataclass -class NormalizeUnicode(NormalizationStep): +class _NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @@ -168,6 +168,23 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: ) .outputs() ) + pass + + +@dataclass +class NormalizeUnicode(NormalizationStep): + normalization_form: str = "NFD" + + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + {"normalization_form": self.normalization_form.lower()}, + ) + .outputs() + ) @dataclass diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2caea5f4..11b11584 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,8 +150,8 @@ if(sentencepiece_FOUND) else() FetchContent_Declare( sentencepiece - URL https://github.com/google/sentencepiece/archive/refs/tags/v0.2.0.tar.gz - URL_HASH SHA256=9970f0a0afee1648890293321665e5b2efa04eaec9f1671fcf8048f456f5bb86 + URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz + URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 ) FetchContent_GetProperties(sentencepiece) if(NOT sentencepiece_POPULATED) diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index d5ff9739..64cdc34b 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -11,8 +11,8 @@ using namespace ov; namespace { -std::shared_ptr make_identity_spec() { - auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec("identity"); +std::shared_ptr make_normalization_spec(const std::string& normalization_form) { + auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec(normalization_form); return std::make_shared(spec); } @@ -21,35 +21,53 @@ std::shared_ptr make_identity_spec() { void CharsMapNormalization::validate_and_infer_types() { auto input_size = get_input_size(); - OPENVINO_ASSERT(input_size == 4 || input_size == 5, "supported input sizes are 4 or 5"); - - const bool has_skips = (input_size == 5); + bool has_skips; + if (m_normalization_form == "") { + OPENVINO_ASSERT(input_size == 4 || input_size == 5, "supported input sizes are 4 or 5 with input spec"); + has_skips = (input_size == 5); 
+ OPENVINO_ASSERT(get_input_element_type(3 + has_skips) == element::u8, "Charsmap normalizer accepts precompiled mapping and it should be of type u8 tensor"); + } else { + OPENVINO_ASSERT(input_size == 3 || input_size == 4, "supported input sizes are 3 or 4 without input spec"); + has_skips = (input_size == 4); + } check_string_input(this, 0); - OPENVINO_ASSERT(get_input_element_type(3 + has_skips) == element::u8, "Charsmap normalizer accepts precompiled mapping and it should be of type u8 tensor"); set_string_output(this, 0, get_input_partial_shape(0)); if (has_skips) { this->set_output_type(3, get_input_element_type(3), get_input_partial_shape(3)); }; + std::cerr << "CharsMapNormalization validation done" << std::endl; } bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - const bool has_skips = (inputs.size() == 5); - { - // Write to common trie structures should be protected to prevent race conditions. + std::cerr << "CharsMapNormalization evaluate" << std::endl; + const bool has_skips = (inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); + std::cerr << "has_skips: " << has_skips << std::endl; + { std::lock_guard lock(m_mutex); if (m_normalizer == nullptr) { - const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - m_spec = make_identity_spec(); + std::cerr << "CharsMapNormalization creating normalizer" << std::endl; + auto normalization_form = m_normalization_form == "" ? "identity" : m_normalization_form; + + std::cerr << "normalization_form: " << normalization_form << std::endl; + + m_spec = make_normalization_spec(normalization_form); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); m_spec->set_escape_whitespaces(m_escape_whitespaces); - m_spec->set_precompiled_charsmap(precompiled_charsmap); + + if (m_normalization_form == "") { + std::cerr << "CharsMapNormalization setting precompiled_charsmap" << std::endl; + const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + m_spec->set_precompiled_charsmap(precompiled_charsmap); + }; + m_normalizer = std::make_shared(*m_spec); } } + std::cerr << "CharsMapNormalization evaluating normalization" << std::endl; return evaluate_normalization_helper( outputs, inputs, diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index ef99d9c1..8ef869d0 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -27,18 +27,40 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec) { + std::cerr << "CharsMapNormalization constructor" << std::endl; + constructor_validate_and_infer_types(); + } + CharsMapNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr normalizer, + const std::shared_ptr spec, + bool add_dummy_prefix = false, + bool escape_whitespaces = false, + const std::string& normalization_form = "" + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_normalization_form(normalization_form) { + std::cerr << "CharsMapNormalization constructor2" << std::endl; + constructor_validate_and_infer_types(); + } + CharsMapNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr normalizer, + const std::shared_ptr spec, + const std::string& 
normalization_form = "" + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_normalization_form(normalization_form) { + std::cerr << "CharsMapNormalization constructor3" << std::endl; constructor_validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_normalization_form); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); + visitor.on_attribute("normalization_form", m_normalization_form); return true; } @@ -52,8 +74,9 @@ class CharsMapNormalization : public ov::op::Op { bool m_add_dummy_prefix = false; bool m_escape_whitespaces = false; + std::string m_normalization_form = ""; // spec should be preserved for the lifetime of the normalizer mutable std::shared_ptr m_spec; mutable std::mutex m_mutex; -}; \ No newline at end of file +}; diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 5ea8640e..4765c849 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -74,7 +74,6 @@ def unicode_normalization_test_data(request): utf8_validate_strings = [ # Valid sequences. b"Eng... test, string?!", - b"Eng... test, string?!", b"\xe2\x82\xac", # Euro sign €ß "Проверка, как работает кириллица Љ љ Ђ ђ".encode(), "測試字符串".encode(), @@ -156,6 +155,15 @@ def precompiled_charsmap_json(request, hf_charsmap_tokenizer): return tj["normalizer"]["normalizers"][0] +@pytest.mark.parametrize("test_string", charsmap_test_strings) +def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): + charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) + compiled_model = create_normalization_model(charsmap_normalization_node) + res_ov = compiled_model([test_string])[0][0] + res_hf = hf_charsmap_tokenizer.backend_tokenizer.normalizer.normalize_str(test_string) + assert res_ov == res_hf + + @pytest.mark.parametrize( "test_parameters", [ @@ -181,15 +189,6 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test assert negative == negative_threshold -@pytest.mark.parametrize("test_string", charsmap_test_strings) -def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): - charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) - compiled_model = create_normalization_model(charsmap_normalization_node) - res_ov = compiled_model([test_string])[0][0] - res_hf = hf_charsmap_tokenizer.backend_tokenizer.normalizer.normalize_str(test_string) - assert res_ov == res_hf - - @pytest.mark.parametrize( "test_string, expected, layer", [ From 2f24fec72c1a7b9741bd7a9f6fa718962971c4d8 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 20 Dec 2024 13:21:30 +0000 Subject: [PATCH 03/30] WiP --- src/CMakeLists.txt | 1 + src/charsmap_normalization.cpp | 50 ++++++++++++++++++---------------- src/charsmap_normalization.hpp | 15 ++++++---- tests/layer_tests.py | 49 +++++++++++++++++++++++---------- 4 files changed, 71 insertions(+), 44 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11b11584..97aae478 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -174,6 +174,7 @@ else() set(SPM_ENABLE_SHARED OFF CACHE BOOL "") 
set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") + set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") FetchContent_Populate(sentencepiece) add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) endif() diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 64cdc34b..1a6256e8 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -4,20 +4,11 @@ #include "charsmap_normalization.hpp" #include "utils.hpp" -#include "sentencepiece_trainer.h" // for making normalizer spec +#include "builder.h" // for making normalizer spec #include "absl/strings/str_format.h" using namespace ov; -namespace { - -std::shared_ptr make_normalization_spec(const std::string& normalization_form) { - auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec(normalization_form); - return std::make_shared(spec); -} - -} // namespace - void CharsMapNormalization::validate_and_infer_types() { auto input_size = get_input_size(); @@ -37,37 +28,50 @@ void CharsMapNormalization::validate_and_infer_types() { if (has_skips) { this->set_output_type(3, get_input_element_type(3), get_input_partial_shape(3)); }; - std::cerr << "CharsMapNormalization validation done" << std::endl; } bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - std::cerr << "CharsMapNormalization evaluate" << std::endl; const bool has_skips = (inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); - std::cerr << "has_skips: " << has_skips << std::endl; { std::lock_guard lock(m_mutex); if (m_normalizer == nullptr) { - std::cerr << "CharsMapNormalization creating normalizer" << std::endl; - auto normalization_form = m_normalization_form == "" ? 
"identity" : m_normalization_form; - - std::cerr << "normalization_form: " << normalization_form << std::endl; - - m_spec = make_normalization_spec(normalization_form); + m_spec = std::make_shared(); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); m_spec->set_escape_whitespaces(m_escape_whitespaces); + std::string precompiled_charsmap; if (m_normalization_form == "") { - std::cerr << "CharsMapNormalization setting precompiled_charsmap" << std::endl; - const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - m_spec->set_precompiled_charsmap(precompiled_charsmap); + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + } else if (m_normalization_form == "nfc") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfd") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfkc") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfkd") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else { + OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); }; + std::cerr << "CharsMapNormalization: precompiled_charsmap.size() = " << precompiled_charsmap.size() << std::endl; + std::cerr << "CharsMapNormalization: precompiled_charsmap first 100 chars = " << precompiled_charsmap.substr(0, 100) << std::endl; + + m_spec->set_precompiled_charsmap(precompiled_charsmap); + m_normalizer = std::make_shared(*m_spec); } } - std::cerr << "CharsMapNormalization evaluating normalization" << std::endl; return evaluate_normalization_helper( outputs, inputs, diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index 8ef869d0..43ab09d3 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -27,7 +27,6 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec) { - std::cerr << "CharsMapNormalization constructor" << std::endl; constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -36,9 +35,10 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr spec, bool add_dummy_prefix = false, bool escape_whitespaces = false, - const std::string& normalization_form = "" - ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_normalization_form(normalization_form) { - std::cerr << "CharsMapNormalization constructor2" << std::endl; + bool case_fold = false, + const std::string& normalization_form = "", + bool nmt = false + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), 
m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -47,20 +47,21 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr spec, const std::string& normalization_form = "" ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_normalization_form(normalization_form) { - std::cerr << "CharsMapNormalization constructor3" << std::endl; constructor_validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_normalization_form); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); visitor.on_attribute("normalization_form", m_normalization_form); + visitor.on_attribute("case_fold", m_case_fold); + visitor.on_attribute("nmt", m_nmt); return true; } @@ -74,6 +75,8 @@ class CharsMapNormalization : public ov::op::Op { bool m_add_dummy_prefix = false; bool m_escape_whitespaces = false; + bool m_case_fold = false; + bool m_nmt = false; std::string m_normalization_form = ""; // spec should be preserved for the lifetime of the normalizer diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 4765c849..9a271bc6 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -57,12 +57,16 @@ def hex_to_char(hex_str): @pytest.fixture(scope="session") -def unicode_normalization_test_data(request): +def icu_test_data(request): + return requests.get(UNICODE_TEST_FILE_URL).text + + +@pytest.fixture(scope="session") +def unicode_normalization_test_data(request, icu_test_data): # check https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt for details - test_file = requests.get(UNICODE_TEST_FILE_URL).text return [ parse_normalization_test_line(line) - for line in test_file.split("\n") + for line in icu_test_data.split("\n") if line and not line.startswith("#") and not line.startswith("@") ] @@ -167,26 +171,41 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled @pytest.mark.parametrize( "test_parameters", [ - ("NFC", 19875, 90), - ("NFD", 19851, 114), - ("NFKC", 19777, 188), - ("NFKD", 19753, 212), + # results for sentencepiece charsmap: + ("NFC", 17325), # failed examples: 2640 + ("NFD", 17736), # failed examples: 2229 + ("NFKC", 17159), # failed examples: 2806 + ("NFKD", 17554), # failed examples: 2411 + # results for icu70: + # ("NFC", 19875), # failed examples: 90 + # ("NFD", 19851), # failed examples: 114 + # ("NFKC", 19777), # failed examples: 188 + # ("NFKD", 19753), # failed examples: 212 + # results for huggingface tokenizers: + # ("NFC", 19247), # failed examples: 718 + # ("NFD", 19220), # failed examples: 745 + # ("NFKC", 19077), # failed examples: 888 + # ("NFKD", 19050), # failed examples: 915 ] ) def test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): - normalization_type, positive_threshold, negative_threshold = test_parameters - nfc_normalizer_layer = NormalizeUnicode(normalization_type) - compiled_model = create_normalization_model(nfc_normalizer_layer) - negative = 0 - positive 
= 0 + normalization_type, positive_threshold = test_parameters + normalizer_layer = NormalizeUnicode(normalization_type) + compiled_model = create_normalization_model(normalizer_layer) + positive, negative, no_transformation = 0, 0, 0 for test_input in unicode_normalization_test_data: res_ov = compiled_model([test_input.source])[0][0].encode() expected = getattr(test_input, normalization_type.lower()).encode() positive += res_ov == expected negative += res_ov != expected - - assert positive == positive_threshold - assert negative == negative_threshold + no_transformation += test_input.source.encode() == expected + + assert positive == positive_threshold, ( + f"{normalization_type}\n" + f"Positive: {positive}, expected: {positive_threshold}\n" + f"Negative: {negative}, expected: {len(unicode_normalization_test_data) - positive_threshold}\n" + f"No transformation: {no_transformation}, positive delta: {positive - no_transformation}" + ) @pytest.mark.parametrize( From 08052c2b20b32d20dcd6865713f77f4549d5236d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 8 Jan 2025 20:03:15 +0000 Subject: [PATCH 04/30] Switch Casefold and UnicodeNormalization to CharsMap --- .../openvino_tokenizers/tokenizer_pipeline.py | 66 +++++++++++++------ src/charsmap_normalization.cpp | 26 ++++---- src/charsmap_normalization.hpp | 7 +- tests/layer_tests.py | 19 ++++++ 4 files changed, 83 insertions(+), 35 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 76102d1a..c09ce09c 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -155,25 +155,15 @@ class NormalizationStep(BasePipelineStep): @dataclass -class _NormalizeUnicode(NormalizationStep): +class NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" - def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return ( - _get_factory() - .create( - "NormalizeUnicode", - input_nodes, - {"normalization_form": self.normalization_form}, + def __post_init__(self): + if self.normalization_form not in ["NFD", "NFC", "NFKD", "NFKC"]: + raise ValueError( + 'NormalizeUnicode`normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], ' + f'got {self.normalization_form}.' 
) - .outputs() - ) - pass - - -@dataclass -class NormalizeUnicode(NormalizationStep): - normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return ( @@ -181,7 +171,10 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: .create( "CharsMapNormalization", input_nodes, - {"normalization_form": self.normalization_form.lower()}, + { + "normalization_form": self.normalization_form.lower(), + "remove_extra_whitespaces": False, + }, ) .outputs() ) @@ -199,7 +192,19 @@ def __post_init__(self): ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return _get_factory().create("CaseFold", input_nodes, {"encoding": self.encoding}).outputs() + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": "identity", + "case_fold": True, + "remove_extra_whitespaces": False, + }, + ) + .outputs() + ) @dataclass @@ -262,7 +267,17 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @dataclass class CharsmapStep(NormalizationStep): - charsmap: bytes + charsmap: Optional[bytes] = None + normalization_form: Optional[str] = None + add_dummy_prefix: bool = False + remove_extra_whitespaces: bool = True + escape_whitespaces: bool = False + case_fold: bool = False + nmt: bool = False + + def __post_init__(self): + if self.charsmap is None and self.normalization_form is None: + raise ValueError("[ CharsmapStep ] `charsmap` or `normalization_form` attribute must be set") @classmethod def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": @@ -270,7 +285,18 @@ def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes += make_constant_node(np.frombuffer(self.charsmap, dtype=np.uint8), dtype=Type.u8).outputs() - return _get_factory().create("CharsMapNormalization", input_nodes).outputs() + return _get_factory().create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": self.normalization_form or "", + "add_dummy_prefix": self.add_dummy_prefix, + "remove_extra_whitespaces": self.remove_extra_whitespaces, + "escape_whitespaces": self.escape_whitespaces, + "case_fold": self.case_fold, + "nmt": self.nmt, + } + ).outputs() @dataclass diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 1a6256e8..5178801a 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -38,34 +38,34 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor if (m_normalizer == nullptr) { m_spec = std::make_shared(); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); + m_spec->set_remove_extra_whitespaces(m_remove_extra_whitespaces); m_spec->set_escape_whitespaces(m_escape_whitespaces); - std::string precompiled_charsmap; - if (m_normalization_form == "") { - precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + sentencepiece::normalizer::Builder::CharsMap chars_map; + if (m_normalization_form == "identity") { + // no need to modify chars_map } else if (m_normalization_form == "nfc") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfd") { - sentencepiece::normalizer::Builder::CharsMap chars_map; 
sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfkc") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfkd") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else { OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); }; - std::cerr << "CharsMapNormalization: precompiled_charsmap.size() = " << precompiled_charsmap.size() << std::endl; - std::cerr << "CharsMapNormalization: precompiled_charsmap first 100 chars = " << precompiled_charsmap.substr(0, 100) << std::endl; + if (m_case_fold) { + sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); + }; + std::string precompiled_charsmap; + if (m_normalization_form == "") { + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + } else { + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } m_spec->set_precompiled_charsmap(precompiled_charsmap); m_normalizer = std::make_shared(*m_spec); diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index 43ab09d3..a6179b63 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -34,11 +34,12 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec, bool add_dummy_prefix = false, + bool remove_extra_whitespaces = false, bool escape_whitespaces = false, bool case_fold = false, const std::string& normalization_form = "", bool nmt = false - ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_remove_extra_whitespaces(remove_extra_whitespaces), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -53,11 +54,12 @@ class CharsMapNormalization : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_remove_extra_whitespaces, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); + visitor.on_attribute("remove_extra_whitespaces", m_remove_extra_whitespaces); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); visitor.on_attribute("normalization_form", m_normalization_form); visitor.on_attribute("case_fold", m_case_fold); @@ -74,6 +76,7 @@ class CharsMapNormalization : public ov::op::Op { mutable std::shared_ptr 
m_normalizer; bool m_add_dummy_prefix = false; + bool m_remove_extra_whitespaces = true; bool m_escape_whitespaces = false; bool m_case_fold = false; bool m_nmt = false; diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 9a271bc6..7637b585 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -12,6 +12,7 @@ from openvino_tokenizers import _get_factory from openvino_tokenizers.constants import UTF8ReplaceMode from openvino_tokenizers.tokenizer_pipeline import ( + CaseFoldStep, CharsmapStep, DecodingStep, NormalizationStep, @@ -208,6 +209,24 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test ) + +@pytest.mark.parametrize( + "test_string, expected", + [ + ("a", "a"), + ("A", "a"), + ("Ю", "ю"), + ("Σ", "σ"), + ("Hello World!", "hello world!"), + ] +) +def test_casefold_normalization(test_string, expected): + casefold = CaseFoldStep() + compiled_model = create_normalization_model(casefold) + res_ov = compiled_model([test_string])[0] + assert res_ov == expected + + @pytest.mark.parametrize( "test_string, expected, layer", [ From f6c001b1c2d524394bf74c1233fd9b37cd046da7 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 26 Nov 2024 15:01:54 +0000 Subject: [PATCH 05/30] Add unicode normalization layer tests --- tests/layer_tests.py | 72 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/tests/layer_tests.py b/tests/layer_tests.py index f46b1be9..ddc5be85 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -2,10 +2,11 @@ import re import tempfile from pathlib import Path -from typing import Union +from typing import Union, NamedTuple import openvino as ov import pytest +import requests from openvino import Model, PartialShape, Type from openvino.runtime import op from openvino_tokenizers import _get_factory @@ -19,12 +20,52 @@ RegexSplitStep, TokenizerPipeline, UTF8ValidateStep, + NormalizeUnicode, ) from tests.utils import get_hf_tokenizer core = ov.Core() +UNICODE_TEST_FILE_URL = "https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt" + + +class NormalizationTestLine(NamedTuple): + source: str + nfc: str + nfd: str + nfkc: str + nfkd: str + comment: str + +def parse_normalization_test_line(line): + parts, comment = line.split("#", 1) + parts = [part.strip() for part in parts.split(";")] + + # Convert the hexadecimal Unicode code points to characters + def hex_to_char(hex_str): + return "".join(chr(int(code, 16)) for code in hex_str.split()) + + # Parse the components + source = hex_to_char(parts[0]) + nfc = hex_to_char(parts[1]) + nfd = hex_to_char(parts[2]) + nfkc = hex_to_char(parts[3]) + nfkd = hex_to_char(parts[4]) + + return NormalizationTestLine(source, nfc, nfd, nfkc, nfkd, comment) + + +@pytest.fixture(scope="session") +def unicode_normalization_test_data(request): + # check https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt for details + test_file = requests.get(UNICODE_TEST_FILE_URL).text + return [ + parse_normalization_test_line(line) + for line in test_file.split("\n") + if line and not line.startswith("#") and not line.startswith("@") + ] + ############################################ ########## Test Normalizer Step ############ @@ -115,6 +156,31 @@ def precompiled_charsmap_json(request, hf_charsmap_tokenizer): return tj["normalizer"]["normalizers"][0] +@pytest.mark.parametrize( + "test_parameters", + [ + ("NFC", 19875, 90), + ("NFD", 19851, 114), + ("NFKC", 19777, 188), + ("NFKD", 19753, 212), + ] +) +def 
test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): + normalization_type, positive_threshold, negative_threshold = test_parameters + nfc_normalizer_layer = NormalizeUnicode(normalization_type) + compiled_model = create_normalization_model(nfc_normalizer_layer) + negative = 0 + positive = 0 + for test_input in unicode_normalization_test_data: + res_ov = compiled_model([test_input.source])[0][0].encode() + expected = getattr(test_input, normalization_type.lower()).encode() + positive += res_ov == expected + negative += res_ov != expected + + assert positive == positive_threshold + assert negative == negative_threshold + + @pytest.mark.parametrize("test_string", charsmap_test_strings) def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) @@ -140,7 +206,7 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled RegexNormalizationStep( regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)", replace_term=r"\1", - ) + ), ), ("", "", RegexNormalizationStep.prepend_regex("▁")), ("\n", "▁\n", RegexNormalizationStep.prepend_regex("▁")), @@ -152,7 +218,7 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled RegexNormalizationStep( regex_search_pattern=r"(^)(.)", replace_term=r"▁\2", - ) + ), ), ( # test backward compatibility with old regex "\n", From 472b163a62b1003537fe007768490399aa4b364e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 29 Nov 2024 10:47:56 +0000 Subject: [PATCH 06/30] WiP --- .../openvino_tokenizers/tokenizer_pipeline.py | 19 ++++++++- src/CMakeLists.txt | 4 +- src/charsmap_normalization.cpp | 42 +++++++++++++------ src/charsmap_normalization.hpp | 27 +++++++++++- tests/layer_tests.py | 19 ++++----- 5 files changed, 84 insertions(+), 27 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index fa99f6b8..3e751491 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -155,7 +155,7 @@ class NormalizationStep(BasePipelineStep): @dataclass -class NormalizeUnicode(NormalizationStep): +class _NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @@ -168,6 +168,23 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: ) .outputs() ) + pass + + +@dataclass +class NormalizeUnicode(NormalizationStep): + normalization_form: str = "NFD" + + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + {"normalization_form": self.normalization_form.lower()}, + ) + .outputs() + ) @dataclass diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2caea5f4..11b11584 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,8 +150,8 @@ if(sentencepiece_FOUND) else() FetchContent_Declare( sentencepiece - URL https://github.com/google/sentencepiece/archive/refs/tags/v0.2.0.tar.gz - URL_HASH SHA256=9970f0a0afee1648890293321665e5b2efa04eaec9f1671fcf8048f456f5bb86 + URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz + URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 ) FetchContent_GetProperties(sentencepiece) if(NOT sentencepiece_POPULATED) diff 
--git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index d5ff9739..64cdc34b 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -11,8 +11,8 @@ using namespace ov; namespace { -std::shared_ptr make_identity_spec() { - auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec("identity"); +std::shared_ptr make_normalization_spec(const std::string& normalization_form) { + auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec(normalization_form); return std::make_shared(spec); } @@ -21,35 +21,53 @@ std::shared_ptr make_identity_spec() { void CharsMapNormalization::validate_and_infer_types() { auto input_size = get_input_size(); - OPENVINO_ASSERT(input_size == 4 || input_size == 5, "supported input sizes are 4 or 5"); - - const bool has_skips = (input_size == 5); + bool has_skips; + if (m_normalization_form == "") { + OPENVINO_ASSERT(input_size == 4 || input_size == 5, "supported input sizes are 4 or 5 with input spec"); + has_skips = (input_size == 5); + OPENVINO_ASSERT(get_input_element_type(3 + has_skips) == element::u8, "Charsmap normalizer accepts precompiled mapping and it should be of type u8 tensor"); + } else { + OPENVINO_ASSERT(input_size == 3 || input_size == 4, "supported input sizes are 3 or 4 without input spec"); + has_skips = (input_size == 4); + } check_string_input(this, 0); - OPENVINO_ASSERT(get_input_element_type(3 + has_skips) == element::u8, "Charsmap normalizer accepts precompiled mapping and it should be of type u8 tensor"); set_string_output(this, 0, get_input_partial_shape(0)); if (has_skips) { this->set_output_type(3, get_input_element_type(3), get_input_partial_shape(3)); }; + std::cerr << "CharsMapNormalization validation done" << std::endl; } bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - const bool has_skips = (inputs.size() == 5); - { - // Write to common trie structures should be protected to prevent race conditions. + std::cerr << "CharsMapNormalization evaluate" << std::endl; + const bool has_skips = (inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); + std::cerr << "has_skips: " << has_skips << std::endl; + { std::lock_guard lock(m_mutex); if (m_normalizer == nullptr) { - const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - m_spec = make_identity_spec(); + std::cerr << "CharsMapNormalization creating normalizer" << std::endl; + auto normalization_form = m_normalization_form == "" ? 
"identity" : m_normalization_form; + + std::cerr << "normalization_form: " << normalization_form << std::endl; + + m_spec = make_normalization_spec(normalization_form); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); m_spec->set_escape_whitespaces(m_escape_whitespaces); - m_spec->set_precompiled_charsmap(precompiled_charsmap); + + if (m_normalization_form == "") { + std::cerr << "CharsMapNormalization setting precompiled_charsmap" << std::endl; + const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + m_spec->set_precompiled_charsmap(precompiled_charsmap); + }; + m_normalizer = std::make_shared(*m_spec); } } + std::cerr << "CharsMapNormalization evaluating normalization" << std::endl; return evaluate_normalization_helper( outputs, inputs, diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index ef99d9c1..8ef869d0 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -27,18 +27,40 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec) { + std::cerr << "CharsMapNormalization constructor" << std::endl; + constructor_validate_and_infer_types(); + } + CharsMapNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr normalizer, + const std::shared_ptr spec, + bool add_dummy_prefix = false, + bool escape_whitespaces = false, + const std::string& normalization_form = "" + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_normalization_form(normalization_form) { + std::cerr << "CharsMapNormalization constructor2" << std::endl; + constructor_validate_and_infer_types(); + } + CharsMapNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr normalizer, + const std::shared_ptr spec, + const std::string& normalization_form = "" + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_normalization_form(normalization_form) { + std::cerr << "CharsMapNormalization constructor3" << std::endl; constructor_validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_normalization_form); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); + visitor.on_attribute("normalization_form", m_normalization_form); return true; } @@ -52,8 +74,9 @@ class CharsMapNormalization : public ov::op::Op { bool m_add_dummy_prefix = false; bool m_escape_whitespaces = false; + std::string m_normalization_form = ""; // spec should be preserved for the lifetime of the normalizer mutable std::shared_ptr m_spec; mutable std::mutex m_mutex; -}; \ No newline at end of file +}; diff --git a/tests/layer_tests.py b/tests/layer_tests.py index ddc5be85..f5d8fca9 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -74,7 +74,6 @@ def unicode_normalization_test_data(request): utf8_validate_strings = [ # Valid sequences. b"Eng... test, string?!", - b"Eng... 
test, string?!", b"\xe2\x82\xac", # Euro sign €ß "Проверка, как работает кириллица Љ љ Ђ ђ".encode(), "測試字符串".encode(), @@ -156,6 +155,15 @@ def precompiled_charsmap_json(request, hf_charsmap_tokenizer): return tj["normalizer"]["normalizers"][0] +@pytest.mark.parametrize("test_string", charsmap_test_strings) +def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): + charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) + compiled_model = create_normalization_model(charsmap_normalization_node) + res_ov = compiled_model([test_string])[0][0] + res_hf = hf_charsmap_tokenizer.backend_tokenizer.normalizer.normalize_str(test_string) + assert res_ov == res_hf + + @pytest.mark.parametrize( "test_parameters", [ @@ -181,15 +189,6 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test assert negative == negative_threshold -@pytest.mark.parametrize("test_string", charsmap_test_strings) -def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): - charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) - compiled_model = create_normalization_model(charsmap_normalization_node) - res_ov = compiled_model([test_string])[0][0] - res_hf = hf_charsmap_tokenizer.backend_tokenizer.normalizer.normalize_str(test_string) - assert res_ov == res_hf - - @pytest.mark.parametrize( "test_string, expected, layer", [ From 04fb20c5329ddc07cab6d44ee491e3d4dd4fb7c6 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 20 Dec 2024 13:21:30 +0000 Subject: [PATCH 07/30] WiP --- src/CMakeLists.txt | 1 + src/charsmap_normalization.cpp | 50 ++++++++++++++++++---------------- src/charsmap_normalization.hpp | 15 ++++++---- tests/layer_tests.py | 49 +++++++++++++++++++++++---------- 4 files changed, 71 insertions(+), 44 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11b11584..97aae478 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -174,6 +174,7 @@ else() set(SPM_ENABLE_SHARED OFF CACHE BOOL "") set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") + set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") FetchContent_Populate(sentencepiece) add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) endif() diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 64cdc34b..1a6256e8 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -4,20 +4,11 @@ #include "charsmap_normalization.hpp" #include "utils.hpp" -#include "sentencepiece_trainer.h" // for making normalizer spec +#include "builder.h" // for making normalizer spec #include "absl/strings/str_format.h" using namespace ov; -namespace { - -std::shared_ptr make_normalization_spec(const std::string& normalization_form) { - auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec(normalization_form); - return std::make_shared(spec); -} - -} // namespace - void CharsMapNormalization::validate_and_infer_types() { auto input_size = get_input_size(); @@ -37,37 +28,50 @@ void CharsMapNormalization::validate_and_infer_types() { if (has_skips) { this->set_output_type(3, get_input_element_type(3), get_input_partial_shape(3)); }; - std::cerr << "CharsMapNormalization validation done" << std::endl; } bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - std::cerr << "CharsMapNormalization evaluate" << std::endl; const bool has_skips = 
(inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); - std::cerr << "has_skips: " << has_skips << std::endl; { std::lock_guard lock(m_mutex); if (m_normalizer == nullptr) { - std::cerr << "CharsMapNormalization creating normalizer" << std::endl; - auto normalization_form = m_normalization_form == "" ? "identity" : m_normalization_form; - - std::cerr << "normalization_form: " << normalization_form << std::endl; - - m_spec = make_normalization_spec(normalization_form); + m_spec = std::make_shared(); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); m_spec->set_escape_whitespaces(m_escape_whitespaces); + std::string precompiled_charsmap; if (m_normalization_form == "") { - std::cerr << "CharsMapNormalization setting precompiled_charsmap" << std::endl; - const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - m_spec->set_precompiled_charsmap(precompiled_charsmap); + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + } else if (m_normalization_form == "nfc") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfd") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfkc") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfkd") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else { + OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); }; + std::cerr << "CharsMapNormalization: precompiled_charsmap.size() = " << precompiled_charsmap.size() << std::endl; + std::cerr << "CharsMapNormalization: precompiled_charsmap first 100 chars = " << precompiled_charsmap.substr(0, 100) << std::endl; + + m_spec->set_precompiled_charsmap(precompiled_charsmap); + m_normalizer = std::make_shared(*m_spec); } } - std::cerr << "CharsMapNormalization evaluating normalization" << std::endl; return evaluate_normalization_helper( outputs, inputs, diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index 8ef869d0..43ab09d3 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -27,7 +27,6 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec) { - std::cerr << "CharsMapNormalization constructor" << std::endl; constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -36,9 +35,10 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr spec, bool add_dummy_prefix = false, bool escape_whitespaces = false, - const std::string& normalization_form = "" - ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), 
m_normalization_form(normalization_form) { - std::cerr << "CharsMapNormalization constructor2" << std::endl; + bool case_fold = false, + const std::string& normalization_form = "", + bool nmt = false + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -47,20 +47,21 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr spec, const std::string& normalization_form = "" ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_normalization_form(normalization_form) { - std::cerr << "CharsMapNormalization constructor3" << std::endl; constructor_validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_normalization_form); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); visitor.on_attribute("normalization_form", m_normalization_form); + visitor.on_attribute("case_fold", m_case_fold); + visitor.on_attribute("nmt", m_nmt); return true; } @@ -74,6 +75,8 @@ class CharsMapNormalization : public ov::op::Op { bool m_add_dummy_prefix = false; bool m_escape_whitespaces = false; + bool m_case_fold = false; + bool m_nmt = false; std::string m_normalization_form = ""; // spec should be preserved for the lifetime of the normalizer diff --git a/tests/layer_tests.py b/tests/layer_tests.py index f5d8fca9..59d93c7e 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -57,12 +57,16 @@ def hex_to_char(hex_str): @pytest.fixture(scope="session") -def unicode_normalization_test_data(request): +def icu_test_data(request): + return requests.get(UNICODE_TEST_FILE_URL).text + + +@pytest.fixture(scope="session") +def unicode_normalization_test_data(request, icu_test_data): # check https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt for details - test_file = requests.get(UNICODE_TEST_FILE_URL).text return [ parse_normalization_test_line(line) - for line in test_file.split("\n") + for line in icu_test_data.split("\n") if line and not line.startswith("#") and not line.startswith("@") ] @@ -167,26 +171,41 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled @pytest.mark.parametrize( "test_parameters", [ - ("NFC", 19875, 90), - ("NFD", 19851, 114), - ("NFKC", 19777, 188), - ("NFKD", 19753, 212), + # results for sentencepiece charsmap: + ("NFC", 17325), # failed examples: 2640 + ("NFD", 17736), # failed examples: 2229 + ("NFKC", 17159), # failed examples: 2806 + ("NFKD", 17554), # failed examples: 2411 + # results for icu70: + # ("NFC", 19875), # failed examples: 90 + # ("NFD", 19851), # failed examples: 114 + # ("NFKC", 19777), # failed examples: 188 + # ("NFKD", 19753), # failed examples: 212 + # results for huggingface tokenizers: + # ("NFC", 19247), # failed examples: 718 + # ("NFD", 19220), # failed examples: 745 + # ("NFKC", 19077), # failed examples: 888 + # ("NFKD", 19050), # failed examples: 915 ] ) def 
test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): - normalization_type, positive_threshold, negative_threshold = test_parameters - nfc_normalizer_layer = NormalizeUnicode(normalization_type) - compiled_model = create_normalization_model(nfc_normalizer_layer) - negative = 0 - positive = 0 + normalization_type, positive_threshold = test_parameters + normalizer_layer = NormalizeUnicode(normalization_type) + compiled_model = create_normalization_model(normalizer_layer) + positive, negative, no_transformation = 0, 0, 0 for test_input in unicode_normalization_test_data: res_ov = compiled_model([test_input.source])[0][0].encode() expected = getattr(test_input, normalization_type.lower()).encode() positive += res_ov == expected negative += res_ov != expected - - assert positive == positive_threshold - assert negative == negative_threshold + no_transformation += test_input.source.encode() == expected + + assert positive == positive_threshold, ( + f"{normalization_type}\n" + f"Positive: {positive}, expected: {positive_threshold}\n" + f"Negative: {negative}, expected: {len(unicode_normalization_test_data) - positive_threshold}\n" + f"No transformation: {no_transformation}, positive delta: {positive - no_transformation}" + ) @pytest.mark.parametrize( From ed1203f2183dfdcb13855694755b1ca62da4f6e4 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 8 Jan 2025 20:03:15 +0000 Subject: [PATCH 08/30] Switch Casefold and UnicodeNormalization to CharsMap --- .../openvino_tokenizers/tokenizer_pipeline.py | 66 +++++++++++++------ src/charsmap_normalization.cpp | 26 ++++---- src/charsmap_normalization.hpp | 7 +- tests/layer_tests.py | 19 ++++++ 4 files changed, 83 insertions(+), 35 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 3e751491..66742004 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -155,25 +155,15 @@ class NormalizationStep(BasePipelineStep): @dataclass -class _NormalizeUnicode(NormalizationStep): +class NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" - def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return ( - _get_factory() - .create( - "NormalizeUnicode", - input_nodes, - {"normalization_form": self.normalization_form}, + def __post_init__(self): + if self.normalization_form not in ["NFD", "NFC", "NFKD", "NFKC"]: + raise ValueError( + 'NormalizeUnicode`normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], ' + f'got {self.normalization_form}.' 
) - .outputs() - ) - pass - - -@dataclass -class NormalizeUnicode(NormalizationStep): - normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return ( @@ -181,7 +171,10 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: .create( "CharsMapNormalization", input_nodes, - {"normalization_form": self.normalization_form.lower()}, + { + "normalization_form": self.normalization_form.lower(), + "remove_extra_whitespaces": False, + }, ) .outputs() ) @@ -199,7 +192,19 @@ def __post_init__(self): ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return _get_factory().create("CaseFold", input_nodes, {"encoding": self.encoding}).outputs() + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": "identity", + "case_fold": True, + "remove_extra_whitespaces": False, + }, + ) + .outputs() + ) @dataclass @@ -262,7 +267,17 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @dataclass class CharsmapStep(NormalizationStep): - charsmap: bytes + charsmap: Optional[bytes] = None + normalization_form: Optional[str] = None + add_dummy_prefix: bool = False + remove_extra_whitespaces: bool = True + escape_whitespaces: bool = False + case_fold: bool = False + nmt: bool = False + + def __post_init__(self): + if self.charsmap is None and self.normalization_form is None: + raise ValueError("[ CharsmapStep ] `charsmap` or `normalization_form` attribute must be set") @classmethod def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": @@ -270,7 +285,18 @@ def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes += make_constant_node(np.frombuffer(self.charsmap, dtype=np.uint8), dtype=Type.u8).outputs() - return _get_factory().create("CharsMapNormalization", input_nodes).outputs() + return _get_factory().create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": self.normalization_form or "", + "add_dummy_prefix": self.add_dummy_prefix, + "remove_extra_whitespaces": self.remove_extra_whitespaces, + "escape_whitespaces": self.escape_whitespaces, + "case_fold": self.case_fold, + "nmt": self.nmt, + } + ).outputs() @dataclass diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 1a6256e8..5178801a 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -38,34 +38,34 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor if (m_normalizer == nullptr) { m_spec = std::make_shared(); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); + m_spec->set_remove_extra_whitespaces(m_remove_extra_whitespaces); m_spec->set_escape_whitespaces(m_escape_whitespaces); - std::string precompiled_charsmap; - if (m_normalization_form == "") { - precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + sentencepiece::normalizer::Builder::CharsMap chars_map; + if (m_normalization_form == "identity") { + // no need to modify chars_map } else if (m_normalization_form == "nfc") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfd") { - sentencepiece::normalizer::Builder::CharsMap chars_map; 
sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfkc") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfkd") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else { OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); }; - std::cerr << "CharsMapNormalization: precompiled_charsmap.size() = " << precompiled_charsmap.size() << std::endl; - std::cerr << "CharsMapNormalization: precompiled_charsmap first 100 chars = " << precompiled_charsmap.substr(0, 100) << std::endl; + if (m_case_fold) { + sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); + }; + std::string precompiled_charsmap; + if (m_normalization_form == "") { + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + } else { + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } m_spec->set_precompiled_charsmap(precompiled_charsmap); m_normalizer = std::make_shared(*m_spec); diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index 43ab09d3..a6179b63 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -34,11 +34,12 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec, bool add_dummy_prefix = false, + bool remove_extra_whitespaces = false, bool escape_whitespaces = false, bool case_fold = false, const std::string& normalization_form = "", bool nmt = false - ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_remove_extra_whitespaces(remove_extra_whitespaces), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -53,11 +54,12 @@ class CharsMapNormalization : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_remove_extra_whitespaces, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); + visitor.on_attribute("remove_extra_whitespaces", m_remove_extra_whitespaces); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); visitor.on_attribute("normalization_form", m_normalization_form); visitor.on_attribute("case_fold", m_case_fold); @@ -74,6 +76,7 @@ class CharsMapNormalization : public ov::op::Op { mutable std::shared_ptr 
m_normalizer; bool m_add_dummy_prefix = false; + bool m_remove_extra_whitespaces = true; bool m_escape_whitespaces = false; bool m_case_fold = false; bool m_nmt = false; diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 59d93c7e..56f87978 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -12,6 +12,7 @@ from openvino_tokenizers import _get_factory from openvino_tokenizers.constants import UTF8ReplaceMode from openvino_tokenizers.tokenizer_pipeline import ( + CaseFoldStep, CharsmapStep, DecodingStep, NormalizationStep, @@ -208,6 +209,24 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test ) + +@pytest.mark.parametrize( + "test_string, expected", + [ + ("a", "a"), + ("A", "a"), + ("Ю", "ю"), + ("Σ", "σ"), + ("Hello World!", "hello world!"), + ] +) +def test_casefold_normalization(test_string, expected): + casefold = CaseFoldStep() + compiled_model = create_normalization_model(casefold) + res_ov = compiled_model([test_string])[0] + assert res_ov == expected + + @pytest.mark.parametrize( "test_string, expected, layer", [ From 012fb8ee1facc3691fd133e5b29504692567d81c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 12:03:38 +0000 Subject: [PATCH 09/30] Update tests and fix custom charsmap support --- src/charsmap_normalization.cpp | 4 ++-- tests/layer_tests.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 5178801a..2efb6752 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -42,7 +42,7 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor m_spec->set_escape_whitespaces(m_escape_whitespaces); sentencepiece::normalizer::Builder::CharsMap chars_map; - if (m_normalization_form == "identity") { + if (m_normalization_form == "identity" || m_normalization_form == "") { // no need to modify chars_map } else if (m_normalization_form == "nfc") { sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); @@ -53,7 +53,7 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor } else if (m_normalization_form == "nfkd") { sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); } else { - OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); + OPENVINO_ASSERT(false, "Unsupported normalization form: `" + m_normalization_form + "`"); }; if (m_case_fold) { diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 56f87978..a5079977 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -175,8 +175,8 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled # results for sentencepiece charsmap: ("NFC", 17325), # failed examples: 2640 ("NFD", 17736), # failed examples: 2229 - ("NFKC", 17159), # failed examples: 2806 - ("NFKD", 17554), # failed examples: 2411 + ("NFKC", 17224), # failed examples: 2741 + ("NFKD", 17619), # failed examples: 2346 # results for icu70: # ("NFC", 19875), # failed examples: 90 # ("NFD", 19851), # failed examples: 114 From 80927208afd32fb2dd47d5cd8dc2f9e69537a27d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 12:09:47 +0000 Subject: [PATCH 10/30] Ruff checks --- .../openvino_tokenizers/tokenizer_pipeline.py | 32 +++++++++++-------- tests/conftest.py | 2 +- tests/layer_tests.py | 14 ++++---- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py 
b/python/openvino_tokenizers/tokenizer_pipeline.py index 66742004..9394fbae 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -161,8 +161,8 @@ class NormalizeUnicode(NormalizationStep): def __post_init__(self): if self.normalization_form not in ["NFD", "NFC", "NFKD", "NFKC"]: raise ValueError( - 'NormalizeUnicode`normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], ' - f'got {self.normalization_form}.' + '[ NormalizeUnicode ] `normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], ' + f"got {self.normalization_form}." ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @@ -285,18 +285,22 @@ def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes += make_constant_node(np.frombuffer(self.charsmap, dtype=np.uint8), dtype=Type.u8).outputs() - return _get_factory().create( - "CharsMapNormalization", - input_nodes, - { - "normalization_form": self.normalization_form or "", - "add_dummy_prefix": self.add_dummy_prefix, - "remove_extra_whitespaces": self.remove_extra_whitespaces, - "escape_whitespaces": self.escape_whitespaces, - "case_fold": self.case_fold, - "nmt": self.nmt, - } - ).outputs() + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": self.normalization_form or "", + "add_dummy_prefix": self.add_dummy_prefix, + "remove_extra_whitespaces": self.remove_extra_whitespaces, + "escape_whitespaces": self.escape_whitespaces, + "case_fold": self.case_fold, + "nmt": self.nmt, + }, + ) + .outputs() + ) @dataclass diff --git a/tests/conftest.py b/tests/conftest.py index f1d3fcfc..717ebb11 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,7 +57,7 @@ def add_tokenizer_type(row): results_df.hf_tiktoken_tokenizers_with_padding_sides_param, inplace=True ) results_df.status = (results_df.status == "passed").astype(int) - results_df = results_df.dropna(subset=['hf_wordpiece_tokenizers_param']) + results_df = results_df.dropna(subset=["hf_wordpiece_tokenizers_param"]) results_df["Model"] = ( results_df.hf_wordpiece_tokenizers_param + ["_legacy" * value for value in results_df.index.str.contains("Slow")] diff --git a/tests/layer_tests.py b/tests/layer_tests.py index a5079977..3cdf68c4 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -2,7 +2,7 @@ import re import tempfile from pathlib import Path -from typing import Union, NamedTuple +from typing import NamedTuple, Union import openvino as ov import pytest @@ -16,12 +16,12 @@ CharsmapStep, DecodingStep, NormalizationStep, + NormalizeUnicode, PreTokenizatinStep, RegexNormalizationStep, RegexSplitStep, TokenizerPipeline, UTF8ValidateStep, - NormalizeUnicode, ) from tests.utils import get_hf_tokenizer @@ -39,6 +39,7 @@ class NormalizationTestLine(NamedTuple): nfkd: str comment: str + def parse_normalization_test_line(line): parts, comment = line.split("#", 1) parts = [part.strip() for part in parts.split(";")] @@ -187,7 +188,7 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled # ("NFD", 19220), # failed examples: 745 # ("NFKC", 19077), # failed examples: 888 # ("NFKD", 19050), # failed examples: 915 - ] + ], ) def test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): normalization_type, positive_threshold = test_parameters @@ -209,7 +210,6 @@ def 
test_unicode_normalization_model(test_parameters, unicode_normalization_test ) - @pytest.mark.parametrize( "test_string, expected", [ @@ -218,7 +218,7 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test ("Ю", "ю"), ("Σ", "σ"), ("Hello World!", "hello world!"), - ] + ], ) def test_casefold_normalization(test_string, expected): casefold = CaseFoldStep() @@ -263,9 +263,9 @@ def test_casefold_normalization(test_string, expected): RegexNormalizationStep( regex_search_pattern=r"(^)(.+)", replace_term=r"▁$2", - ) + ), ), - ] + ], ) def test_regex_normalization(test_string, expected, layer): compiled_model = create_normalization_model(layer) From 6a611f345af6a7bf4358393475f793f263ac98d6 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 14:02:34 +0000 Subject: [PATCH 11/30] wip --- .github/workflows/linux.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 5b3d19ac..b2980197 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -109,6 +109,7 @@ jobs: - name: CMake configure - tokenizers run: | + apt install -y libicu-dev source ${INSTALL_DIR}/setupvars.sh cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ From 258f0f49f7dde8eedaa1f31aac03851039a7b3df Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 14:17:03 +0000 Subject: [PATCH 12/30] wip --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index b2980197..6573af48 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -109,7 +109,7 @@ jobs: - name: CMake configure - tokenizers run: | - apt install -y libicu-dev + apt-get update && apt install -y libicu-dev source ${INSTALL_DIR}/setupvars.sh cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ From baf0e705831b62fa9f5f2547ee78d7b7bad62df8 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 15:40:59 +0000 Subject: [PATCH 13/30] wip --- .github/workflows/linux.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 6573af48..1295ce0a 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -187,6 +187,7 @@ jobs: - name: Build tokenizers wheel run: | + apt-get update && apt install -y libicu-dev python -m pip wheel -v --no-deps --wheel-dir ${BUILD_DIR} \ --config-settings=override=cross.arch="manylinux_2_31_x86_64" \ --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ From 6177b81c805a24bbe6b9194b42a9efe902dbaf9f Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 10:45:34 +0000 Subject: [PATCH 14/30] Switch Off FastTokenizer Support UnicodeNormalization and CaseFold operations using new backend. Do not use FastTokenizer dependency during build. 
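
A minimal sketch (illustration only, not applied by this patch) of how the reworked
steps are expected to be exercised, assuming the create_normalization_model() helper
defined in tests/layer_tests.py; with this change both steps lower to the
sentencepiece-backed CharsMapNormalization op instead of the FastTokenizer kernels:

    # Sketch under the assumptions above; expected output follows the new layer tests.
    from openvino_tokenizers.tokenizer_pipeline import CaseFoldStep, NormalizeUnicode
    from tests.layer_tests import create_normalization_model  # helper from the test module

    # encoding="utf-8" -> CharsMapNormalization with an identity chars map plus Unicode case folding
    casefold = create_normalization_model(CaseFoldStep("utf-8"))
    print(casefold(["Hello World!"])[0][0])  # expected: "hello world!"

    # NFC/NFD/NFKC/NFKD are compiled into a precompiled chars map by sentencepiece's Builder
    nfkc = create_normalization_model(NormalizeUnicode("NFKC"))
    print(nfkc(["①"])[0][0])  # normalization is handled by the CharsMapNormalization node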
--- .../openvino_tokenizers/tokenizer_pipeline.py | 27 ++++---- src/CMakeLists.txt | 2 +- src/case_fold.cpp | 30 ++++++--- src/case_fold.hpp | 8 ++- src/charsmap_normalization.cpp | 57 +++++++++------- src/normalize_unicode.cpp | 65 ++++++++++++++----- src/normalize_unicode.hpp | 10 +-- src/ov_extension.cpp | 23 ++----- src/tensorflow_translators.cpp | 11 ---- src/tensorflow_translators.hpp | 3 - src/tokenizer.hpp | 3 - tests/layer_tests.py | 21 +++--- 12 files changed, 143 insertions(+), 117 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 72ef0d16..c145292e 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -192,19 +192,22 @@ def __post_init__(self): ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return ( - _get_factory() - .create( - "CharsMapNormalization", - input_nodes, - { - "normalization_form": "identity", - "case_fold": True, - "remove_extra_whitespaces": False, - }, + if self.encoding == "": + return _get_factory().create("CaseFold", input_nodes, {"encoding": self.encoding}).outputs() + else: + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": "identity", + "case_fold": True, + "remove_extra_whitespaces": False, + }, + ) + .outputs() ) - .outputs() - ) @dataclass diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 97aae478..942521f9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,7 +82,7 @@ else() endif() endif() -cmake_dependent_option(ENABLE_FAST_TOKENIZERS "Enables Fast Tokenizers usage in OpenVINO Tokenizers" ON "FAST_TOKENIZERS_SUPPORTED" OFF) +cmake_dependent_option(ENABLE_FAST_TOKENIZERS "Enables Fast Tokenizers usage in OpenVINO Tokenizers" OFF "FAST_TOKENIZERS_SUPPORTED" OFF) if(ENABLE_FAST_TOKENIZERS) # The option is forced to ON if _GLIBCXX_USE_CXX11_ABI=0 or on Android (where prebuilt version is not available) diff --git a/src/case_fold.cpp b/src/case_fold.cpp index 043248dc..1c0821be 100644 --- a/src/case_fold.cpp +++ b/src/case_fold.cpp @@ -2,12 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#ifdef ENABLE_FAST_TOKENIZERS - #include "case_fold.hpp" #include "utils.hpp" - -#include "fast_tokenizer/normalizers/normalizers.h" +#include "builder.h" // for making normalizer spec using namespace ov; @@ -31,6 +28,24 @@ void CaseFold::validate_and_infer_types() { bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { const bool has_skips = (inputs.size() == 4); + { + std::lock_guard lock(m_mutex); + + if (m_normalizer == nullptr && m_encoding == "utf-8") { + m_spec = std::make_shared(); + m_spec->set_add_dummy_prefix(false); + m_spec->set_remove_extra_whitespaces(true); + m_spec->set_escape_whitespaces(false); + + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); + std::string precompiled_charsmap; + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + m_spec->set_precompiled_charsmap(precompiled_charsmap); + + m_normalizer = std::make_shared(*m_spec); + } + } if (m_encoding.empty()) { return evaluate_normalization_helper( outputs, inputs, @@ -45,12 +60,9 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input return evaluate_normalization_helper( outputs, inputs, - [](const std::string& str) { - using namespace paddlenlp::fast_tokenizer; 
- return normalizers::NormalizedString(str).Lowercase().GetStr(); + [&](const std::string& str) { + return m_normalizer->Normalize(str); }, has_skips); } } - -#endif // ENABLE_FAST_TOKENIZERS diff --git a/src/case_fold.hpp b/src/case_fold.hpp index 9ae3a75c..bfdf99d5 100644 --- a/src/case_fold.hpp +++ b/src/case_fold.hpp @@ -4,8 +4,7 @@ #pragma once -#ifdef ENABLE_FAST_TOKENIZERS - +#include "normalizer.h" // from sentencepiece #include class CaseFold : public ov::op::Op { @@ -40,6 +39,9 @@ class CaseFold : public ov::op::Op { private: std::string m_encoding = "utf-8"; + mutable std::shared_ptr m_normalizer; + // spec should be preserved for the lifetime of the normalizer + mutable std::shared_ptr m_spec; + mutable std::mutex m_mutex; }; -#endif // ENABLE_FAST_TOKENIZERS diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 2efb6752..0d0dae19 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -30,6 +30,31 @@ void CharsMapNormalization::validate_and_infer_types() { }; } + +inline void init_sentencepiece_normalizer_chars_map( + const std::string& normalization_form, + const bool case_fold, + sentencepiece::normalizer::Builder::CharsMap& chars_map +) { + if (normalization_form == "identity") { + // no need to modify chars_map + } else if (normalization_form == "nfc") { + sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); + } else if (normalization_form == "nfd") { + sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); + } else if (normalization_form == "nfkc") { + sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); + } else if (normalization_form == "nfkd") { + sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); + } else { + OPENVINO_ASSERT(false, "Unsupported normalization form: `" + normalization_form + "`"); + }; + if (case_fold) { + sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); + }; +} + + bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { const bool has_skips = (inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); { @@ -41,31 +66,14 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor m_spec->set_remove_extra_whitespaces(m_remove_extra_whitespaces); m_spec->set_escape_whitespaces(m_escape_whitespaces); - sentencepiece::normalizer::Builder::CharsMap chars_map; - if (m_normalization_form == "identity" || m_normalization_form == "") { - // no need to modify chars_map - } else if (m_normalization_form == "nfc") { - sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); - } else if (m_normalization_form == "nfd") { - sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); - } else if (m_normalization_form == "nfkc") { - sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); - } else if (m_normalization_form == "nfkd") { - sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); - } else { - OPENVINO_ASSERT(false, "Unsupported normalization form: `" + m_normalization_form + "`"); - }; - - if (m_case_fold) { - sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); - }; - std::string precompiled_charsmap; - if (m_normalization_form == "") { - precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - } else { + if (m_normalization_form != "") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + init_sentencepiece_normalizer_chars_map(m_normalization_form, m_case_fold, 
chars_map); sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); - } + } else { + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + }; m_spec->set_precompiled_charsmap(precompiled_charsmap); m_normalizer = std::make_shared(*m_spec); @@ -76,8 +84,7 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor outputs, inputs, [&](const std::string& str) { - auto norm = m_normalizer->Normalize(str); - return norm; + return m_normalizer->Normalize(str); }, has_skips ); diff --git a/src/normalize_unicode.cpp b/src/normalize_unicode.cpp index a8c07f50..39c6c999 100644 --- a/src/normalize_unicode.cpp +++ b/src/normalize_unicode.cpp @@ -2,36 +2,41 @@ // SPDX-License-Identifier: Apache-2.0 // -#ifdef ENABLE_FAST_TOKENIZERS - #ifdef _MSC_VER # pragma warning(disable : 4251) # pragma warning(disable : 4275) #endif -#include "fast_tokenizer/normalizers/normalizers.h" - #include "normalize_unicode.hpp" #include "utils.hpp" +#include "builder.h" // for making normalizer spec using namespace ov; -namespace { -using namespace paddlenlp::fast_tokenizer::normalizers; -using NormalizersMap = std::map>; - -const NormalizersMap normalizers = { - {"NFD", [](const std::string& str) { return NormalizedString(str).NFD().GetStr(); }}, - {"NFC", [](const std::string& str) { return NormalizedString(str).NFC().GetStr(); }}, - {"NFKD", [](const std::string& str) { return NormalizedString(str).NFKD().GetStr(); }}, - {"NFKC", [](const std::string& str) { return NormalizedString(str).NFKC().GetStr(); }}, -}; +inline void init_unicode_normalizer_chars_map( + const std::string& normalization_form, + sentencepiece::normalizer::Builder::CharsMap& chars_map +) { + if (normalization_form == "NFC") { + sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); + } else if (normalization_form == "NFD") { + sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); + } else if (normalization_form == "NFKC") { + sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); + } else if (normalization_form == "NFKD") { + sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); + } else { + OPENVINO_ASSERT(false, "Unsupported normalization form: `" + normalization_form + "`"); + }; } + void NormalizeUnicode::validate_and_infer_types() { check_string_input(this, 0); - OPENVINO_ASSERT(normalizers.find(m_normalization_form) != normalizers.end(), "NormalizeUnicode doesn't know normalization form ", m_normalization_form); + OPENVINO_ASSERT( + m_normalization_form == "NFC" || m_normalization_form == "NFD" || m_normalization_form == "NFKC" || m_normalization_form == "NFKD", + "NormalizeUnicode doesn't know normalization form ", m_normalization_form); set_string_output(this, 0, get_input_partial_shape(0)); auto input_size = get_input_size(); @@ -44,7 +49,31 @@ void NormalizeUnicode::validate_and_infer_types() { bool NormalizeUnicode::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { const bool has_skips = (inputs.size() == 4); - return evaluate_normalization_helper(outputs, inputs, normalizers.at(m_normalization_form), has_skips); -} -#endif // ENABLE_FAST_TOKENIZERS + { + std::lock_guard lock(m_mutex); + + if (m_normalizer == nullptr) { + m_spec = std::make_shared(); + m_spec->set_add_dummy_prefix(false); + m_spec->set_remove_extra_whitespaces(true); + m_spec->set_escape_whitespaces(false); + + sentencepiece::normalizer::Builder::CharsMap chars_map; + 
init_unicode_normalizer_chars_map(m_normalization_form, chars_map); + std::string precompiled_charsmap; + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + m_spec->set_precompiled_charsmap(precompiled_charsmap); + + m_normalizer = std::make_shared(*m_spec); + } + } + return evaluate_normalization_helper( + outputs, + inputs, + [&](const std::string& str) { + return m_normalizer->Normalize(str); + }, + has_skips + ); +} diff --git a/src/normalize_unicode.hpp b/src/normalize_unicode.hpp index 12c8043e..04c832d6 100644 --- a/src/normalize_unicode.hpp +++ b/src/normalize_unicode.hpp @@ -4,8 +4,7 @@ #pragma once -#ifdef ENABLE_FAST_TOKENIZERS - +#include "normalizer.h" // from sentencepiece #include class NormalizeUnicode : public ov::op::Op { @@ -38,8 +37,9 @@ class NormalizeUnicode : public ov::op::Op { } private: - std::string m_normalization_form = "NFD"; + mutable std::shared_ptr m_normalizer; + // spec should be preserved for the lifetime of the normalizer + mutable std::shared_ptr m_spec; + mutable std::mutex m_mutex; }; - -#endif // ENABLE_FAST_TOKENIZERS diff --git a/src/ov_extension.cpp b/src/ov_extension.cpp index 7369fe42..5de761b7 100644 --- a/src/ov_extension.cpp +++ b/src/ov_extension.cpp @@ -20,25 +20,10 @@ std::make_shared("Equal", translate_equal), \ std::make_shared("StringToHashBucketFast", translate_string_to_hash_bucket_fast), \ std::make_shared("Squeeze", translate_squeeze_op), \ - std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), - -#ifdef ENABLE_FAST_TOKENIZERS - -#define OPENVINO_TOKENIZERS_FAST_TOKENIZER_BASED_EXTENSIONS \ - std::make_shared>(), \ - std::make_shared>(), - -#define OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS_FAST_TOKENIZER_BASED \ + std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), \ std::make_shared("StringLower", translate_string_lower), \ std::make_shared("NormalizeUTF8", translate_normalize_utf8), \ - std::make_shared("CaseFoldUTF8", translate_case_fold_utf8), - -#else - -#define OPENVINO_TOKENIZERS_FAST_TOKENIZER_BASED_EXTENSIONS -#define OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS_FAST_TOKENIZER_BASED - -#endif // ENABLE_FAST_TOKENIZERS + std::make_shared("CaseFoldUTF8", translate_case_fold_utf8) // clang-format off //! [ov_extension:entry_point] @@ -70,9 +55,9 @@ OPENVINO_CREATE_EXTENSIONS( std::make_shared>(), std::make_shared>(), std::make_shared>(), - OPENVINO_TOKENIZERS_FAST_TOKENIZER_BASED_EXTENSIONS + std::make_shared>(), + std::make_shared>(), OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS - OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS_FAST_TOKENIZER_BASED })); //! 
[ov_extension:entry_point] // clang-format on diff --git a/src/tensorflow_translators.cpp b/src/tensorflow_translators.cpp index e279d752..03c80192 100644 --- a/src/tensorflow_translators.cpp +++ b/src/tensorflow_translators.cpp @@ -23,11 +23,8 @@ #include "string_to_hash_bucket.hpp" #include "vocab_encoder.hpp" #include "wordpiece_tokenizer.hpp" - -#ifdef ENABLE_FAST_TOKENIZERS #include "case_fold.hpp" #include "normalize_unicode.hpp" -#endif // ENABLE_FAST_TOKENIZERS using namespace ov; using namespace ov::op; @@ -156,8 +153,6 @@ NamedOutputVector translate_ragged_tensor_to_sparse(const NodeContext& node) { return named_results; } -#ifdef ENABLE_FAST_TOKENIZERS - ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) { FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFold expects only 1 input"); return { post_translate_string_tensor_output(std::make_shared( @@ -171,8 +166,6 @@ ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) node.get_attribute("normalization_form"))->outputs()) }; } -#endif // ENABLE_FAST_TOKENIZERS - ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node) { auto node_name = node.get_name(); FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StaticRegexReplace expects only 1 input"); @@ -221,8 +214,6 @@ ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::N return { post_translate_ragged_tensor_output(wp_tokenizer->outputs()) }; } -#ifdef ENABLE_FAST_TOKENIZERS - ov::OutputVector translate_string_lower(const ov::frontend::NodeContext& node) { auto node_name = node.get_name(); FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StringLower expects only 1 input"); @@ -233,8 +224,6 @@ ov::OutputVector translate_string_lower(const ov::frontend::NodeContext& node) { return { string_lower_result }; } -#endif // ENABLE_FAST_TOKENIZERS - OutputVector translate_lookup_table_find_op(const ov::frontend::NodeContext& node) { FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "LookupTableFind or LookupTableFindV2 expects 3 inputs"); auto table_handle = as_type_ptr(node.get_input_by_reference(0).get_node_shared_ptr()); diff --git a/src/tensorflow_translators.hpp b/src/tensorflow_translators.hpp index 4dbc26b2..250d3c60 100644 --- a/src/tensorflow_translators.hpp +++ b/src/tensorflow_translators.hpp @@ -17,9 +17,6 @@ ov::OutputVector translate_equal(const ov::frontend::NodeContext& node); ov::OutputVector translate_string_to_hash_bucket_fast(const ov::frontend::NodeContext& node); ov::OutputVector translate_squeeze_op(const ov::frontend::NodeContext& node); ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); - -#ifdef ENABLE_FAST_TOKENIZERS ov::OutputVector translate_string_lower(const ov::frontend::NodeContext& node); ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node); ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node); -#endif // ENABLE_FAST_TOKENIZERS diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index 343fe032..e5c22314 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -28,10 +28,7 @@ #include "special_tokens_split.hpp" #include "charsmap_normalization.hpp" #include "wordpiece_tokenizer.hpp" - -#ifdef ENABLE_FAST_TOKENIZERS #include "case_fold.hpp" #include "normalize_unicode.hpp" -#endif // ENABLE_FAST_TOKENIZERS #include "tensorflow_translators.hpp" diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 9dd0f9c8..c9d53092 100644 
--- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -211,17 +211,22 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test @pytest.mark.parametrize( - "test_string, expected", + "test_string, expected, is_uft8", [ - ("a", "a"), - ("A", "a"), - ("Ю", "ю"), - ("Σ", "σ"), - ("Hello World!", "hello world!"), + ("a", "a", True), + ("a", "a", False), + ("A", "a", True), + ("A", "a", False), + ("Ю", "ю", True), + ("Ю", "Ю", False), + ("Σ", "σ", True), + ("Σ", "Σ", False), + ("Hello World!", "hello world!", True), + ("Hello World!", "hello world!", False), ], ) -def test_casefold_normalization(test_string, expected): - casefold = CaseFoldStep() +def test_casefold_normalization(test_string, expected, is_uft8): + casefold = CaseFoldStep("utf-8" if is_uft8 else "") compiled_model = create_normalization_model(casefold) res_ov = compiled_model([test_string])[0] assert res_ov == expected From 68b7e4e6f7885bf37e9bc73bcbcd155ccf3e0c0c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 10:59:01 +0000 Subject: [PATCH 15/30] Delete torch from dependencies --- pyproject.toml | 6 +----- tests/tokenizers_test.py | 2 -- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1cf5ba0d..22ca6ba2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,17 +41,13 @@ transformers = [ "transformers[sentencepiece] >= 4.36.0", "tiktoken" ] -# chatglm2 custom tokenizer file imports torch, have to add torch dependency for tests -torch = [ - 'torch' -] dev = [ "ruff", "bandit", "pytest", "pytest_harvest", "pandas", - "openvino_tokenizers[transformers, torch]" + "openvino_tokenizers[transformers]" ] benchmark = [ "pandas", diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py index 1c021ce7..2cb8356f 100644 --- a/tests/tokenizers_test.py +++ b/tests/tokenizers_test.py @@ -124,8 +124,6 @@ "xlm-roberta-base", "microsoft/deberta-v3-base", "xlnet-base-cased", - # "THUDM/chatglm-6b", # hf_tokenizer init error - # "THUDM/chatglm2-6b", # _pad doesn't support padding side - broke in 4.45 # "THUDM/chatglm3-6b", # _pad doesn't support padding side - broke in 4.45 "t5-base", "facebook/musicgen-small", From 7244191308afe7fa570437ab74bb698a76696373 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 11:15:04 +0000 Subject: [PATCH 16/30] Delete FastTokenizer from cmake and readme --- README.md | 71 ----------------- src/CMakeLists.txt | 193 +-------------------------------------------- 2 files changed, 1 insertion(+), 263 deletions(-) diff --git a/README.md b/README.md index 7678bbb2..dc6c3c52 100644 --- a/README.md +++ b/README.md @@ -150,77 +150,6 @@ make After that, you can transfer all binaries from `build/src` to `` as described in the C++ installation instruction above. -### Reducing the ICU Data Size - -By default, all available ICU locales are supported, which significantly increases the package size. To reduce the size of the ICU libraries included in your final package, follow these steps: - -1. **Use the ICU Data Configuration File**: - - This file specifies which features and locales to include in a custom data bundle. You can find more information [here](https://unicode-org.github.io/icu/userguide/icu_data/buildtool.html#icu-data-configuration-file). - -2. 
**Set the ICU Data Filter File as an Environment Variable**: - - **On Unix-like systems (Linux, macOS)**: - Set the `ICU_DATA_FILTER_FILE` environment variable to the path of your configuration file (`filters.json`): - - ```bash - export ICU_DATA_FILTER_FILE="filters.json" - ``` - - - **On Windows**: - Set the `ICU_DATA_FILTER_FILE` environment variable using the Command Prompt or PowerShell: - - **Command Prompt:** - ```cmd - set ICU_DATA_FILTER_FILE=filters.json - ``` - - **PowerShell:** - ```powershell - $env:ICU_DATA_FILTER_FILE="filters.json" - ``` - -3. **Create a Configuration File**: - - An example configuration file (`filters.json`) might look like this: - - ```json - { - "localeFilter": { - "filterType": "language", - "includelist": [ - "en" - ] - } - } - ``` - -4. **Configure OpenVINO Tokenizers**: - - When building OpenVINO tokenizers, set the following CMake option during the project configuration: - - ```bash - -DBUILD_FAST_TOKENIZERS=ON - ``` - - Example for a pip installation path: - ```bash - ICU_DATA_FILTER_FILE= pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS=ON - ``` - -By following these instructions, you can effectively reduce the size of the ICU libraries in your final package. - -### Build OpenVINO Tokenizers without FastTokenizer Library - -If a tokenizer doesn't use `CaseFold`, `UnicodeNormalization` or `Wordpiece` operations, you can drastically reduce package binary size by building OpenVINO Tokenizers without FastTokenizer dependency with this flag: - -```bash --DENABLE_FAST_TOKENIZERS=OFF -``` - -This option can also help with building for platform that is supported by FastTokenizer, for example `Android x86_64`. - -Example for a pip installation path: -```bash - -pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.ENABLE_FAST_TOKENIZERS=OFF -``` - ## Usage :warning: OpenVINO Tokenizers can be inferred on a `CPU` device only. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 942521f9..c9277003 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,37 +60,6 @@ if("_GLIBCXX_USE_CXX11_ABI=0" IN_LIST OPENVINO_RUNTIME_COMPILE_DEFINITIONS) set(USE_ABI0 ON CACHE BOOL "Set -D_GLIBCXX_USE_CXX11_ABI to 0 for fast_tokenizers") endif() -if(ANDROID) - if(AARCH64 OR ARM) - set(FAST_TOKENIZERS_SUPPORTED ON) - if(ANDROID_NATIVE_API_LEVEL LESS 33) - message(FATAL_ERROR "FastTokenizers require ANDROID_NATIVE_API_LEVEL to be higher than 33. Please, either disable FastTokenizers or set ANDROID_NATIVE_API_LEVEL / ANDROID_PLATFORM") - endif() - elseif(X86_64 OR RISCV64) - message(WARNING "FastTokenizers are not available on ${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}. ENABLE_FAST_TOKENIZERS is set to OFF") - set(FAST_TOKENIZERS_SUPPORTED OFF) - else() - message(WARNING "Unsupport Android ${CMAKE_SYSTEM_PROCESSOR}. 
Please, contact OpenVINO Tokenizers developers") - endif() - set(FAST_TOKENIZER_FROM_SOURCES ON) -else() - set(FAST_TOKENIZERS_SUPPORTED ON) - if(USE_ABI0 OR (WIN32 AND CMAKE_BUILD_TYPE STREQUAL "Debug")) - set(FAST_TOKENIZER_FROM_SOURCES ON) - else() - set(FAST_TOKENIZER_FROM_SOURCES OFF) - endif() -endif() - -cmake_dependent_option(ENABLE_FAST_TOKENIZERS "Enables Fast Tokenizers usage in OpenVINO Tokenizers" OFF "FAST_TOKENIZERS_SUPPORTED" OFF) - -if(ENABLE_FAST_TOKENIZERS) - # The option is forced to ON if _GLIBCXX_USE_CXX11_ABI=0 or on Android (where prebuilt version is not available) - cmake_dependent_option(BUILD_FAST_TOKENIZERS "Compile core_tokenizers instead of downloading prebuilt library" OFF "NOT FAST_TOKENIZER_FROM_SOURCES" ON) -else() - set(BUILD_FAST_TOKENIZERS OFF CACHE BOOL "Compile core_tokenizers instead of downloading prebuilt library" FORCE) -endif() - # # Compile flags # @@ -122,18 +91,6 @@ if(WIN32 OR APPLE) set(CMAKE_DEBUG_POSTFIX "d") endif() -if(BUILD_FAST_TOKENIZERS) - set(THIRD_PARTY_BUILD_TYPE ${CMAKE_BUILD_TYPE}) - # Set FastTokenizers to use dynamic MSVC runtime - set(MSVC_STATIC_CRT OFF) # PADDLE_LIB - set(PCRE2_STATIC_RUNTIME OFF) # PCRE2_LIB - set(SPM_ENABLE_MSVC_MT_BUILD OFF) # sentencepiece libs - if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - ov_tokenizers_set_flags("CMAKE_CXX_FLAGS_RELEASE;CMAKE_CXX_FLAGS_MINSIZEREL;CMAKE_CXX_FLAGS_RELWITHDEBINFO" "/MD" "/MT") - ov_tokenizers_set_flags("CMAKE_CXX_FLAGS_DEBUG" "/MDd" "/MT") - endif() -endif() - # # Dependencies # @@ -235,114 +192,7 @@ function(ov_tokenizers_build_static_re2) target_compile_definitions(re2 PUBLIC $) endfunction() -if(BUILD_FAST_TOKENIZERS) - set(EXTERNAL_PROJECT_SOURCE_DIR ${CMAKE_BINARY_DIR}/_deps/fast_tokenizer/src) - set(EXTERNAL_PROJECT_BINARY_DIR ${CMAKE_BINARY_DIR}/_deps/fast_tokenizer/build) - set(EXTERNAL_PROJECT_SUBBUILD_DIR ${CMAKE_BINARY_DIR}/_deps/fast_tokenizer/sub-build) - - FetchContent_Declare( - fast_tokenizer - URL https://github.com/PaddlePaddle/PaddleNLP/archive/refs/tags/v2.6.1.tar.gz - URL_HASH SHA256=10e3489bc91e938c449a0448fa719e4536803ed6b1c1c95b3402430d6a8a221a - PATCH_COMMAND git --git-dir=${EXTERNAL_PROJECT_SOURCE_DIR} apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/patches/fast_tokenizers.patch" && - git --git-dir=${EXTERNAL_PROJECT_SOURCE_DIR} apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/patches/glog.patch" && - git --git-dir=${EXTERNAL_PROJECT_SOURCE_DIR} apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/patches/gflags.patch" && - git --git-dir=${EXTERNAL_PROJECT_SOURCE_DIR} apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/patches/icu.patch" - SOURCE_DIR ${EXTERNAL_PROJECT_SOURCE_DIR} - BINARY_DIR ${EXTERNAL_PROJECT_BINARY_DIR} - SUBBUILD_DIR ${EXTERNAL_PROJECT_SUBBUILD_DIR} - ) - - FetchContent_GetProperties(fast_tokenizer) - if(NOT fast_tokenizer_POPULATED) - FetchContent_Populate( - fast_tokenizer - ) - set(EXTERNAL_OPTIONAL_ARGS - -DCMAKE_POLICY_DEFAULT_CMP0057=NEW - -DCMAKE_POLICY_DEFAULT_CMP0135=NEW) - set(WITH_PYTHON OFF) - set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - add_subdirectory(${fast_tokenizer_SOURCE_DIR}/fast_tokenizer - ${CMAKE_CURRENT_BINARY_DIR}/fast_tokenizer - EXCLUDE_FROM_ALL - ) - endif() - - # variables used later - set(FAST_TOKENIZER_INCS - "${fast_tokenizer_SOURCE_DIR}/fast_tokenizer" - "${CMAKE_BINARY_DIR}/third_party/dart/src/extern_dart/include/" - "${CMAKE_BINARY_DIR}/third_party/json/src/extern_json/single_include/" - "${CMAKE_BINARY_DIR}/third_party/install/re2/include/") - set(FAST_TOKENIZER_LIBS core_tokenizers) 
-elseif(ENABLE_FAST_TOKENIZERS) - if(WIN32 AND X86_64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.2.zip - URL_HASH SHA256=56470954014bdd3c8c8ad702d20f5f6aa5ab913bff92fd9c3c49ec6da31ff11d - ) - ov_tokenizers_build_static_re2() - elseif(LINUX AND X86_64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.2.tgz - URL_HASH SHA256=843a8299b55ef2e06ea50ba0d4ab4cb05b9e4cdb7cb8e29f3d55c494a1b7aecc - ) - elseif(LINUX AND AARCH64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-aarch64-1.0.2.tgz - URL_HASH SHA256=fc16c51b24a954ae3d659e1b233ce15349eafc1e4c72710b51a4f12fb2c03033 - ) - elseif(APPLE AND X86_64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.2.tgz - # TODO: restore once https://github.com/PaddlePaddle/PaddleNLP/issues/7505 is fixed - # URL_HASH SHA256=4c8123ad941b3e4325ef72f328db545e34d5eec2de3e2545e1ab8ebeeb5146a9 - ) - elseif(APPLE AND AARCH64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-arm64-1.0.2.tgz - URL_HASH SHA256=ffb0f16ec96b2f5dbdb681d00d74e932e273ec1c2108196d13f2fd28abc4d266 - ) - else() - message(FATAL_ERROR "Platform ${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR} does not have prebuilt Fast Tokenizer" - "Please, use -DBUILD_FAST_TOKENIZERS=ON cmake option to enable build from soures") - endif() - - FetchContent_MakeAvailable(fast_tokenizer) - - # to allow find_library to work with conda-forge env - set(_old_CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ${CMAKE_FIND_ROOT_PATH_MODE_LIBRARY}) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake") - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ${_old_CMAKE_FIND_ROOT_PATH_MODE_LIBRARY}) - # since FastTokenizers.cmake overrides C++ standard, let's override it once again to required one - ov_tokenizers_set_cxx_standard() - - if(WIN32 AND X86_64) - # we use re2 library in regex_normalization operation, so have to add to this list - # because prebuilt fast_tokenizers package does not provide this library - list(APPEND FAST_TOKENIZER_LIBS re2) - endif() -else() - # in case if we don't build fast tokenizers, we have to include re2 explicitly - ov_tokenizers_build_static_re2() -endif() - -function(ov_tokenizers_link_fast_tokenizer TARGET_NAME) - if(ENABLE_FAST_TOKENIZERS) - target_include_directories(${TARGET_NAME} SYSTEM PRIVATE ${FAST_TOKENIZER_INCS}) - target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS}) - target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_FAST_TOKENIZERS) - else() - message(FATAL_ERROR "ENABLE_FAST_TOKENIZERS is turned off. 
This function must not be called") - endif() -endfunction() +ov_tokenizers_build_static_re2() function(ov_tokenizers_link_pcre2 TARGET_NAME) FetchContent_Declare( @@ -369,9 +219,7 @@ function(ov_tokenizers_link_pcre2 TARGET_NAME) endfunction() function(ov_tokenizers_link_re2 TARGET_NAME) - if(NOT ENABLE_FAST_TOKENIZERS) target_link_libraries(${TARGET_NAME} PRIVATE re2) - endif() endfunction() # @@ -387,9 +235,6 @@ add_library(${TARGET_NAME} SHARED ${SRCS}) # ov_tokenizers_link_sentencepiece(${TARGET_NAME}) -if(ENABLE_FAST_TOKENIZERS) - ov_tokenizers_link_fast_tokenizer(${TARGET_NAME}) -endif() ov_tokenizers_link_pcre2(${TARGET_NAME}) ov_tokenizers_link_re2(${TARGET_NAME}) @@ -399,36 +244,6 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_OPTIONS "${extra_flags}" target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_OPENVINO_EXTENSION_API) target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime openvino::threading) -# -# Post build steps to copy core_tokenizers dependencies -# - -if(ENABLE_FAST_TOKENIZERS) - if(BUILD_FAST_TOKENIZERS) - set(fast_tokenezers_libs_dir "${CMAKE_BINARY_DIR}/third_party/icu/src/extern_icu/icu4c/bin64") - else() - set(fast_tokenezers_libs_dir "${fast_tokenizer_SOURCE_DIR}/third_party/lib") - if(WIN32 AND X86_64) - set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll") - elseif(LINUX) - set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/libcore_tokenizers.so") - elseif(APPLE) - set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/libcore_tokenizers.dylib") - endif() - endif() - - if(WIN32 AND X86_64) - list(APPEND extra_libs "${fast_tokenezers_libs_dir}/icudt70.dll" - "${fast_tokenezers_libs_dir}/icuuc70$<$:${CMAKE_DEBUG_POSTFIX}>.dll") - endif() - - if(extra_libs) - # post build steps - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${extra_libs} $) - endif() -endif() - # # Set install RPATH # @@ -491,12 +306,6 @@ install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_LIBDIR} COMPONENT openvino_tokenizers RUNTIME DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_BINDIR} COMPONENT openvino_tokenizers) -if(BUILD_FAST_TOKENIZERS) - install(TARGETS core_tokenizers - LIBRARY DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_LIBDIR} COMPONENT openvino_tokenizers - RUNTIME DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_BINDIR} COMPONENT openvino_tokenizers) -endif() - if(extra_libs) if(WIN32) set(extra_libs_location ${OPENVINO_TOKENIZERS_INSTALL_BINDIR}) From 082064c6b05bdf22ddfa6664369015e06c9612f1 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 11:16:27 +0000 Subject: [PATCH 17/30] Delete FastTokenizer related patches --- src/icu_filter_en.json | 8 --- src/patches/fast_tokenizers.patch | 72 ----------------------- src/patches/gflags.patch | 15 ----- src/patches/glog.patch | 47 --------------- src/patches/icu.patch | 95 ------------------------------- 5 files changed, 237 deletions(-) delete mode 100644 src/icu_filter_en.json delete mode 100644 src/patches/fast_tokenizers.patch delete mode 100644 src/patches/gflags.patch delete mode 100644 src/patches/glog.patch delete mode 100644 src/patches/icu.patch diff --git a/src/icu_filter_en.json b/src/icu_filter_en.json deleted file mode 100644 index d7406489..00000000 --- a/src/icu_filter_en.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "localeFilter": { - "filterType": "language", - "includelist": [ - "en" - ] - } -} diff --git a/src/patches/fast_tokenizers.patch b/src/patches/fast_tokenizers.patch deleted file mode 100644 index 
6629b4af..00000000 --- a/src/patches/fast_tokenizers.patch +++ /dev/null @@ -1,72 +0,0 @@ -diff --git a/fast_tokenizer/cmake/ByproductsICU.cmake b/fast_tokenizer/cmake/ByproductsICU.cmake -index 3b68f082..6ae7e8f0 100644 ---- a/fast_tokenizer/cmake/ByproductsICU.cmake -+++ b/fast_tokenizer/cmake/ByproductsICU.cmake -@@ -15,14 +15,14 @@ - # See the License for the specific language governing permissions and - # limitations under the License. - --function(GetICUByproducts ICU_PATH ICU_LIB_VAR ICU_INCLUDE_VAR ICU_BASE_NAMES_VAR) -+function(GetICUByproducts ICU_PATH ICU_LIB_VAR ICU_INCLUDE_VAR ICU_BASE_NAMES_VAR ICU_LIB_POSTFIX) - # include directory - set(${ICU_INCLUDE_VAR} "${ICU_PATH}/include" PARENT_SCOPE) -- -+ - if (WIN32) - # windows basenames and pre/suffixes - set(ICU_LIB_BASE_NAMES dt in io tu uc) -- -+ - set(ICU_SHARED_PREFIX "lib") - set(ICU_STATIC_PREFIX "") - set(ICU_SHARED_SUFFIX ".dll.a") -@@ -39,9 +39,14 @@ function(GetICUByproducts ICU_PATH ICU_LIB_VAR ICU_INCLUDE_VAR ICU_BASE_NAMES_VA - endif() - # add static and shared libs to the libraries variable - foreach(ICU_BASE_NAME ${ICU_LIB_BASE_NAMES}) -- set(ICU_SHARED_LIB "${ICU_PATH}/${ICU_INSTALL_LIB}/${ICU_SHARED_PREFIX}icu${ICU_BASE_NAME}${ICU_SHARED_SUFFIX}") -- set(ICU_STATIC_LIB "${ICU_PATH}/${ICU_INSTALL_LIB}/${ICU_STATIC_PREFIX}icu${ICU_BASE_NAME}${ICU_STATIC_SUFFIX}") -- -+ if(ICU_BASE_NAME STREQUAL "dt") -+ set(ICU_NAME "${ICU_BASE_NAME}") -+ else() -+ set(ICU_NAME "${ICU_BASE_NAME}${ICU_LIB_POSTFIX}") -+ endif() -+ set(ICU_SHARED_LIB "${ICU_PATH}/${ICU_INSTALL_LIB}/${ICU_SHARED_PREFIX}icu${ICU_NAME}${ICU_SHARED_SUFFIX}") -+ set(ICU_STATIC_LIB "${ICU_PATH}/${ICU_INSTALL_LIB}/${ICU_STATIC_PREFIX}icu${ICU_NAME}${ICU_STATIC_SUFFIX}") -+ - if (ICU_STATIC) - list(APPEND ${ICU_LIB_VAR} ${ICU_STATIC_LIB}) - else() - -diff --git a/fast_tokenizer/CMakeLists.txt b/fast_tokenizer/CMakeLists.txt -index ce238239..39f34fa4 100644 ---- a/fast_tokenizer/CMakeLists.txt -+++ b/fast_tokenizer/CMakeLists.txt -@@ -51,7 +51,7 @@ else() - set(CMAKE_CXX_STANDARD 11) - endif() - --IF(WIN32) -+IF(WIN32 AND MSVC_STATIC_CRT) - # Need to add flags for windows - foreach( - flag_var -@@ -126,7 +126,7 @@ set(${flag_var} - set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") - endforeach() - --ELSE(WIN32) -+ELSE() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC") - IF (NOT APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ldl") -@@ -137,7 +137,7 @@ ELSE(WIN32) - ENDIF() - ENDIF() - set (PUBLIC_DEPEND_LIBS ${CMAKE_DL_LIBS}) --ENDIF(WIN32) -+ENDIF() - - set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) - set(TOKENIZERS_INSTALL_INCLUDE_DIR ${PROJECT_SOURCE_DIR}) diff --git a/src/patches/gflags.patch b/src/patches/gflags.patch deleted file mode 100644 index 0217c11c..00000000 --- a/src/patches/gflags.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/fast_tokenizer/cmake/external/gflags.cmake b/fast_tokenizer/cmake/external/gflags.cmake -index df5b3642..fcf385d8 100644 ---- a/fast_tokenizer/cmake/external/gflags.cmake -+++ b/fast_tokenizer/cmake/external/gflags.cmake -@@ -23,8 +23,8 @@ IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) - ELSE(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) -- set(BUILD_COMMAND $(MAKE) --silent) -- set(INSTALL_COMMAND $(MAKE) install) -+ set(BUILD_COMMAND ${CMAKE_COMMAND} --build .) -+ set(INSTALL_COMMAND ${CMAKE_COMMAND} --install .) 
- ENDIF(WIN32) - - INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) diff --git a/src/patches/glog.patch b/src/patches/glog.patch deleted file mode 100644 index 6e40a08e..00000000 --- a/src/patches/glog.patch +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/fast_tokenizer/cmake/external/glog.cmake b/fast_tokenizer/cmake/external/glog.cmake -index 2afc3960..fc2b21ce 100644 ---- a/fast_tokenizer/cmake/external/glog.cmake -+++ b/fast_tokenizer/cmake/external/glog.cmake -@@ -21,17 +21,29 @@ SET(GLOG_REPOSITORY ${GIT_URL}/google/glog.git) - SET(GLOG_TAG v0.4.0) - - IF(WIN32) -- SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." FORCE) -+ SET(GLOG_LIBRARIES_RELEASE "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog release library." FORCE) -+ SET(GLOG_LIBRARIES_DEBUG "${GLOG_INSTALL_DIR}/lib/glogd.lib" CACHE FILEPATH "glog debug library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - ELSE(WIN32) -- SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) -+ SET(GLOG_LIBRARIES_RELEASE "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog release library." FORCE) -+ SET(GLOG_LIBRARIES_DEBUG "${GLOG_INSTALL_DIR}/lib/libglogd.a" CACHE FILEPATH "glog release library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - ENDIF(WIN32) - -+if(GENERATOR_IS_MULTI_CONFIG_VAR) -+ set(GLOG_LIBRARIES "$:${GLOG_LIBRARIES_DEBUG} $:${GLOG_LIBRARIES_RELEASE}") -+else() -+ if(CMAKE_BUILD_TYPE STREQUAL "Debug") -+ set(GLOG_LIBRARIES "${GLOG_LIBRARIES_DEBUG}") -+ else() -+ set(GLOG_LIBRARIES "${GLOG_LIBRARIES_RELEASE}") -+ endif() -+endif() -+ - INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) - --IF(ANDROID) -+IF(ANDROID) - set(CROSS_COMPILE_CMAKE_ARGS - "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}" - "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" -@@ -112,6 +124,7 @@ ExternalProject_Add( - ENDIF() - - ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) --SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -+SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION "${GLOG_LIBRARIES_RELEASE}") -+SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION_DEBUG "${GLOG_LIBRARIES_DEBUG}") - ADD_DEPENDENCIES(glog extern_glog gflags) - LINK_LIBRARIES(glog) -\ No newline at end of file diff --git a/src/patches/icu.patch b/src/patches/icu.patch deleted file mode 100644 index 203d6688..00000000 --- a/src/patches/icu.patch +++ /dev/null @@ -1,95 +0,0 @@ -diff --git a/fast_tokenizer/cmake/external/icu.cmake b/fast_tokenizer/cmake/external/icu.cmake -index cd604d38..a949a156 100644 ---- a/fast_tokenizer/cmake/external/icu.cmake -+++ b/fast_tokenizer/cmake/external/icu.cmake -@@ -50,11 +50,33 @@ set(HOST_ENV_CMAKE ${CMAKE_COMMAND} -E env - LDFLAGS=${HOST_LDFLAGS} - ) - -+if(WIN32) -+ set(CMAKE_DEBUG_POSTFIX "d") -+else() -+ set(CMAKE_DEBUG_POSTFIX "") -+endif() -+ - # predict host libraries - set(ICU_STATIC TRUE) --GetICUByproducts(${ICU_INSTALL_DIR} ICU_LIBRARIES ICU_INCLUDE_DIRS ICU_BASE_NAMES) -+GetICUByproducts(${ICU_INSTALL_DIR} ICU_LIBRARIES_RELEASE ICU_INCLUDE_DIRS ICU_BASE_NAMES "") -+GetICUByproducts(${ICU_INSTALL_DIR} ICU_LIBRARIES_DEBUG ICU_INCLUDE_DIRS ICU_BASE_NAMES "${CMAKE_DEBUG_POSTFIX}") - INCLUDE_DIRECTORIES(${ICU_INCLUDE_DIRS}) - -+if(GENERATOR_IS_MULTI_CONFIG_VAR) -+ set(ICU_LIBRARIES "$:${ICU_LIBRARIES_DEBUG} $:${ICU_LIBRARIES_RELEASE}") -+ set(ICU_CONFIGURE_FLAGS $<$:"--enable-debug">$<$:"--enable-release">) -+ set(ICU_BUILD_TYPE $) -+else() -+ if(CMAKE_BUILD_TYPE STREQUAL 
"Debug") -+ set(ICU_LIBRARIES "${ICU_LIBRARIES_DEBUG}") -+ set(ICU_CONFIGURE_FLAGS "--enable-debug") -+ else() -+ set(ICU_LIBRARIES "${ICU_LIBRARIES_RELEASE}") -+ set(ICU_CONFIGURE_FLAGS "--enable-release") -+ endif() -+ set(ICU_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -+endif() -+ - if(WIN32) - ExternalProject_Add( - extern_icu -@@ -65,7 +87,7 @@ ExternalProject_Add( - GIT_PROGRESS 1 - PREFIX ${ICU_PREFIX_DIR} - UPDATE_COMMAND "" -- CONFIGURE_COMMAND msbuild ..\\extern_icu\\icu4c\\source\\allinone\\allinone.sln /p:Configuration=Release /p:Platform=x64 /p:RuntimeLibrary=MT_StaticRelease /p:SkipUWP=true -+ CONFIGURE_COMMAND msbuild ..\\extern_icu\\icu4c\\source\\allinone\\allinone.sln /p:Configuration=${ICU_BUILD_TYPE} /p:Platform=x64 /p:SkipUWP=true - BUILD_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ../extern_icu/icu4c/include ${ICU_INSTALL_DIR}/include - && ${CMAKE_COMMAND} -E copy_directory ../extern_icu/icu4c/lib64 ${ICU_INSTALL_DIR}/lib64 -@@ -81,7 +103,7 @@ ExternalProject_Add( - GIT_PROGRESS 1 - PREFIX ${ICU_PREFIX_DIR} - UPDATE_COMMAND "" -- CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "MacOSX/GCC" --enable-static --disable-shared --enable-rpath -+ CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "MacOSX/GCC" ${ICU_CONFIGURE_FLAGS} --enable-static --enable-rpath - BUILD_COMMAND make -j4 - INSTALL_COMMAND make install prefix="" DESTDIR=${ICU_INSTALL_DIR} install - BUILD_BYPRODUCTS ${ICU_LIBRARIES} -@@ -98,7 +120,7 @@ ExternalProject_Add( - BUILD_COMMAND "" - INSTALL_COMMAND - ${CMAKE_COMMAND} -E remove_directory ${ICU_INSTALL_DIR} && -- ${CMAKE_COMMAND} -E make_directory ${ICU_INSTALL_DIR} && -+ ${CMAKE_COMMAND} -E make_directory ${ICU_INSTALL_DIR} && - ${CMAKE_COMMAND} -E rename ${ICU_PREFIX_DIR}/src/extern_icu/lib/ ${ICU_INSTALL_DIR}/lib && - ${CMAKE_COMMAND} -E copy_directory ${ICU_PREFIX_DIR}/src/extern_icu/include ${ICU_INSTALL_DIR}/include - BUILD_BYPRODUCTS ${ICU_LIBRARIES} -@@ -113,22 +135,24 @@ ExternalProject_Add( - GIT_PROGRESS 1 - PREFIX ${ICU_PREFIX_DIR} - UPDATE_COMMAND "" -- CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "Linux/gcc" --enable-static --disable-shared --enable-rpath -+ CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "Linux" ${ICU_CONFIGURE_FLAGS} --enable-static --enable-rpath - BUILD_COMMAND make -j4 - INSTALL_COMMAND make install prefix="" DESTDIR=${ICU_INSTALL_DIR} install - BUILD_BYPRODUCTS ${ICU_LIBRARIES} - ) - endif() - --list(LENGTH ICU_LIBRARIES ICU_LIB_LEN) -+list(LENGTH ICU_LIBRARIES_RELEASE ICU_LIB_LEN) - MATH(EXPR ICU_LIB_LEN "${ICU_LIB_LEN}-1") - - # icui18n icudata icuuc icuio icutu - foreach(ICU_IDX RANGE ${ICU_LIB_LEN}) -- list(GET ICU_LIBRARIES ${ICU_IDX} ICU_LIB) -+ list(GET ICU_LIBRARIES_RELEASE ${ICU_IDX} ICU_LIB_RELEASE) -+ list(GET ICU_LIBRARIES_DEBUG ${ICU_IDX} ICU_LIB_DEBUG) - list(GET ICU_BASE_NAMES ${ICU_IDX} ICU_BASE_NAME) - ADD_LIBRARY("icu${ICU_BASE_NAME}" STATIC IMPORTED GLOBAL) -- SET_PROPERTY(TARGET "icu${ICU_BASE_NAME}" PROPERTY IMPORTED_LOCATION ${ICU_LIB}) -+ SET_PROPERTY(TARGET "icu${ICU_BASE_NAME}" PROPERTY IMPORTED_LOCATION ${ICU_LIB_RELEASE}) -+ SET_PROPERTY(TARGET "icu${ICU_BASE_NAME}" PROPERTY IMPORTED_LOCATION_DEBUG ${ICU_LIB_DEBUG}) - ADD_DEPENDENCIES("icu${ICU_BASE_NAME}" extern_icu) - list(APPEND ICU_INTERFACE_LINK_LIBRARIES "icu${ICU_BASE_NAME}") - endforeach() From 7380898f41b94aeb163ef3578124f174cd0d9394 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 
2025 11:58:10 +0000 Subject: [PATCH 18/30] Delete FastTokenizer build form CI --- .github/workflows/linux.yml | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 1295ce0a..593b0454 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -63,10 +63,9 @@ jobs: openvino_tokenizers_cpack: - name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }}) + name: OpenVINO tokenizers cpack, BUILD_TYPE=${{ matrix.build_type }}) strategy: matrix: - build_fast_tokenizers: [ON] build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package needs: [ openvino_download ] if: | @@ -111,8 +110,7 @@ jobs: run: | apt-get update && apt install -y libicu-dev source ${INSTALL_DIR}/setupvars.sh - cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -S ${{ env.OPENVINO_TOKENIZERS_REPO }} \ -B ${{ env.BUILD_DIR }} @@ -139,15 +137,13 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }} + name: openvino_tokenizers_cpack_${{ matrix.build_type }} path: ${{ env.BUILD_DIR }}/*.tar.gz if-no-files-found: 'error' openvino_tokenizers_wheel: - name: OpenVINO tokenizers extension (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}) - strategy: - matrix: - build_fast_tokenizers: [ON, OFF] + name: OpenVINO tokenizers extension wheel + needs: [ openvino_download ] if: | always() && @@ -190,7 +186,6 @@ jobs: apt-get update && apt install -y libicu-dev python -m pip wheel -v --no-deps --wheel-dir ${BUILD_DIR} \ --config-settings=override=cross.arch="manylinux_2_31_x86_64" \ - --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ ${{ needs.openvino_download.outputs.ov_wheel_source }} \ ${OPENVINO_TOKENIZERS_REPO} env: @@ -206,15 +201,12 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }} + name: openvino_tokenizers_wheel path: ${{ env.BUILD_DIR }}/*.whl if-no-files-found: 'error' openvino_tokenizers_tests: - name: OpenVINO tokenizers tests (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}) - strategy: - matrix: - build_fast_tokenizers: [ON, OFF] + name: OpenVINO tokenizers tests needs: [ openvino_download, openvino_tokenizers_wheel] if: always() && needs.openvino_tokenizers_wheel.result == 'success' timeout-minutes: 45 @@ -244,7 +236,7 @@ jobs: - name: Download tokenizers package uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: - name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }} + name: openvino_tokenizers_wheel path: ${{ env.INSTALL_DIR }}/ov_tokenizers - name: Download OpenVINO package From 68d0300b0126899918d0ae636b57bd7ba6115da2 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 13:18:39 +0000 Subject: [PATCH 19/30] Delete FastTokenizer build form CI --- .github/workflows/mac.yml | 8 +++----- .github/workflows/windows.yml | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index bec1cd7a..0f6a78b5 
100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -176,10 +176,9 @@ jobs: if-no-files-found: 'error' openvino_tokenizers_cpack: - name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }}) + name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }}) strategy: matrix: - build_fast_tokenizers: [ON] build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package needs: [ openvino_download, openvino_build ] if: | @@ -226,8 +225,7 @@ jobs: - name: CMake configure - tokenizers run: | source ${INSTALL_DIR}/setupvars.sh - cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -S ${{ env.OPENVINO_TOKENIZERS_REPO }} \ -B ${{ env.BUILD_DIR }} @@ -254,7 +252,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }} + name: openvino_tokenizers_cpack_${{ matrix.build_type }} path: ${{ env.BUILD_DIR }}/*.tar.gz if-no-files-found: 'error' diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index ce5b79b8..539229e1 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -47,10 +47,9 @@ jobs: revision: 'latest_available_commit' openvino_tokenizers_cpack: - name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }}) + name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }}) strategy: matrix: - build_fast_tokenizers: [ON] build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package needs: [ openvino_download ] if: | @@ -115,8 +114,7 @@ jobs: shell: pwsh run: | ${{ env.OV_INSTALL_DIR }}/setupvars.ps1 - cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" ` - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ` + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ` -S ${{ env.OPENVINO_TOKENIZERS_REPO }} ` -B ${{ env.BUILD_DIR }} env: @@ -149,7 +147,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }} + name: openvino_tokenizers_cpack_${{ matrix.build_type }} path: ${{ env.BUILD_DIR }}/*.zip if-no-files-found: 'error' From fc094a027d3798fcbe113790235c51300f50cb58 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 14:45:53 +0000 Subject: [PATCH 20/30] Delete FastTokenizer from Cmake --- pyproject.toml | 1 + src/CMakeLists.txt | 64 +++++++++++++++++++--------------------------- 2 files changed, 27 insertions(+), 38 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22ca6ba2..56cfbf5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ [project.optional-dependencies] transformers = [ "transformers[sentencepiece] >= 4.36.0", + "jinja2", # has to be installed for sentencepiece-based transformers tokenizers to work "tiktoken" ] dev = [ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c9277003..0788fabb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,9 +56,6 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) # That prohibits linkage with prebuilt libraries because they aren't compiled with _GLIBCXX_USE_CXX11_ABI=0. 
get_directory_property(OPENVINO_RUNTIME_COMPILE_DEFINITIONS COMPILE_DEFINITIONS) include(CMakeDependentOption) -if("_GLIBCXX_USE_CXX11_ABI=0" IN_LIST OPENVINO_RUNTIME_COMPILE_DEFINITIONS) - set(USE_ABI0 ON CACHE BOOL "Set -D_GLIBCXX_USE_CXX11_ABI to 0 for fast_tokenizers") -endif() # # Compile flags @@ -97,44 +94,35 @@ endif() include(FetchContent) -if(NOT USE_ABI0) - # for ABI=0 case, we have to build from sources - find_package(sentencepiece QUIET) +FetchContent_Declare( +sentencepiece +URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz +URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 +) +FetchContent_GetProperties(sentencepiece) +if(NOT sentencepiece_POPULATED) +if(DEFINED ENV{CONDA_BUILD_SYSROOT}) + set(openvino_installed_from_conda ON) + # OpenVINO conda package dynamically linked with external protobuf, + # and we need to link sentencepiece with external protobuf too. + set(CMAKE_FIND_PACKAGE_PREFER_CONFIG ON) + set(protobuf_MODULE_COMPATIBLE ON CACHE BOOL "Protobuf module compatible") endif() - -if(sentencepiece_FOUND) - find_package(absl REQUIRED) +if(openvino_installed_from_conda AND NOT WIN32) + set(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "") + set(SPM_PROTOBUF_PROVIDER "package" CACHE STRING "") + set(SPM_ABSL_PROVIDER "package" CACHE STRING "") else() - FetchContent_Declare( - sentencepiece - URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz - URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 - ) - FetchContent_GetProperties(sentencepiece) - if(NOT sentencepiece_POPULATED) - if(DEFINED ENV{CONDA_BUILD_SYSROOT}) - set(openvino_installed_from_conda ON) - # OpenVINO conda package dynamically linked with external protobuf, - # and we need to link sentencepiece with external protobuf too. 
- set(CMAKE_FIND_PACKAGE_PREFER_CONFIG ON) - set(protobuf_MODULE_COMPATIBLE ON CACHE BOOL "Protobuf module compatible") - endif() - if(openvino_installed_from_conda AND NOT WIN32) - set(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "") - set(SPM_PROTOBUF_PROVIDER "package" CACHE STRING "") - set(SPM_ABSL_PROVIDER "package" CACHE STRING "") - else() - set(SPM_USE_BUILTIN_PROTOBUF ON CACHE BOOL "") - set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "") - set(SPM_ABSL_PROVIDER "internal" CACHE STRING "") - endif() + set(SPM_USE_BUILTIN_PROTOBUF ON CACHE BOOL "") + set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "") + set(SPM_ABSL_PROVIDER "internal" CACHE STRING "") +endif() - set(SPM_ENABLE_SHARED OFF CACHE BOOL "") - set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") - set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") - FetchContent_Populate(sentencepiece) - add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) - endif() +set(SPM_ENABLE_SHARED OFF CACHE BOOL "") +set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") +set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") +FetchContent_Populate(sentencepiece) +add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) endif() function(ov_tokenizers_link_sentencepiece TARGET_NAME) From 72b064633348d626e34ab5b10bacdc526fe4a1bf Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 14:47:05 +0000 Subject: [PATCH 21/30] Delete FastTokenizer from Cmake --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 56cfbf5b..edfde65a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,6 @@ dependencies = [ [project.optional-dependencies] transformers = [ "transformers[sentencepiece] >= 4.36.0", - "jinja2", # has to be installed for sentencepiece-based transformers tokenizers to work "tiktoken" ] dev = [ @@ -48,6 +47,7 @@ dev = [ "pytest", "pytest_harvest", "pandas", + "jinja2", "openvino_tokenizers[transformers]" ] benchmark = [ From 4eb7dd065e1cd0f6cecbb015ca862efddb23fa06 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 13 Jan 2025 16:34:54 +0100 Subject: [PATCH 22/30] use custom icu --- src/CMakeLists.txt | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0788fabb..ca4b8e04 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,6 +24,8 @@ if(POLICY CMP0169) cmake_policy(SET CMP0169 OLD) endif() +option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) + function(ov_tokenizers_set_flags flags replace_value replace_pattern) foreach(flag ${flags}) if(${flag} MATCHES "${replace_pattern}") @@ -94,10 +96,32 @@ endif() include(FetchContent) +if (ENABLE_SYSTEM_ICU) + message(STATUS "Using system-installed ICU.") +else() + if(UNIX) + FetchContent_Declare( + ICU + URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Ubuntu-20.04-x64.tgz + URL_HASH SHA256=a8134e9f8a68d33600749601e143e553b5cb48c217c8941dbb9ef478fac420dd + ) + elseif(WIN32) + FetchContent_Declare( + ICU + URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Win64-MSVC2019.zip + URL_HASH SHA256=af6b585e49d90d39ae9d3fe298b7f56983931706a5e49d4bce675c6a499124e5 + ) + endif() + FetchContent_MakeAvailable(ICU) + set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") + list(PREPEND CMAKE_PREFIX_PATH "${ICU_DIR}") + message(STATUS "Using prebuilt ICU from ${ICU_DIR}.") +endif() 
+ FetchContent_Declare( -sentencepiece -URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz -URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 + sentencepiece + URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz + URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 ) FetchContent_GetProperties(sentencepiece) if(NOT sentencepiece_POPULATED) From cafaf0300cfd831ebfb7668f8f320352c65a9d82 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 13 Jan 2025 17:12:55 +0100 Subject: [PATCH 23/30] filter supported targets --- src/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 743b07e6..f27d3c38 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,8 +24,6 @@ if(POLICY CMP0169) cmake_policy(SET CMP0169 OLD) endif() -option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) - function(ov_tokenizers_set_flags flags replace_value replace_pattern) foreach(flag ${flags}) if(${flag} MATCHES "${replace_pattern}") @@ -96,7 +94,10 @@ endif() include(FetchContent) -if (ENABLE_SYSTEM_ICU) +option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) + +# There are no prebuilt ICU packages for macOS and Linux arm64 +if (ENABLE_SYSTEM_ICU OR APPLE OR NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") message(STATUS "Using system-installed ICU.") else() if(UNIX) @@ -113,9 +114,8 @@ else() ) endif() FetchContent_MakeAvailable(ICU) - set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") - list(PREPEND CMAKE_PREFIX_PATH "${ICU_DIR}") - message(STATUS "Using prebuilt ICU from ${ICU_DIR}.") + set(ICU_ROOT "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") + message(STATUS "Using prebuilt ICU from ${ICU_ROOT}.") endif() FetchContent_Declare( From deb6873ec75954ae4e45a7a7899e05310518fdaf Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 13 Jan 2025 17:14:13 +0100 Subject: [PATCH 24/30] removed tmp solution --- .github/workflows/linux.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 3db68e17..22bf6a7b 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -108,7 +108,6 @@ jobs: - name: CMake configure - tokenizers run: | - apt-get update && apt install -y libicu-dev source ${INSTALL_DIR}/setupvars.sh cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -S ${{ env.OPENVINO_TOKENIZERS_REPO }} \ @@ -183,7 +182,6 @@ jobs: - name: Build tokenizers wheel run: | - apt-get update && apt install -y libicu-dev python -m pip wheel -v --no-deps --wheel-dir ${BUILD_DIR} \ --config-settings=override=cross.arch="manylinux_2_31_x86_64" \ ${{ needs.openvino_download.outputs.ov_wheel_source }} \ From 0e1365874baf8d88e30e6710f31f642f0534b2ef Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 13 Jan 2025 17:30:37 +0100 Subject: [PATCH 25/30] brew icu4c --- .github/workflows/mac.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 72a6b012..beae1a50 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -84,7 +84,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja + run: brew install coreutils ninja icu4c - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 From 
ac21acded7cec9b1c26122e99ac57da83487eaf5 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Tue, 14 Jan 2025 08:43:06 +0100 Subject: [PATCH 26/30] install icu4c --- .github/workflows/mac.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index beae1a50..2c3e9ffe 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -84,7 +84,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja icu4c + run: brew install coreutils ninja - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 @@ -220,7 +220,7 @@ jobs: # Build # - name: Install build dependencies - run: brew install coreutils ninja + run: brew install coreutils ninja icu4c - name: CMake configure - tokenizers run: | @@ -312,7 +312,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja + run: brew install coreutils ninja icu4c # # Build From a9c5b389e007d1d1bb5e72c042e3956618004d60 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Tue, 14 Jan 2025 08:52:05 +0100 Subject: [PATCH 27/30] fixed arch detection --- src/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f27d3c38..4ffaa99f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -95,9 +95,9 @@ endif() include(FetchContent) option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) - +set(SUPPORTED_ARCHS "X86;X86_64") # There are no prebuilt ICU packages for macOS and Linux arm64 -if (ENABLE_SYSTEM_ICU OR APPLE OR NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") +if (ENABLE_SYSTEM_ICU OR APPLE OR NOT OV_HOST_ARCH IN_LIST SUPPORTED_ARCHS) message(STATUS "Using system-installed ICU.") else() if(UNIX) From e3eb2fd209ac43c5af9eacbff34927a42986a526 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Tue, 14 Jan 2025 09:28:40 +0100 Subject: [PATCH 28/30] fixed win subpath --- src/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffaa99f..173be66d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -106,16 +106,18 @@ else() URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Ubuntu-20.04-x64.tgz URL_HASH SHA256=a8134e9f8a68d33600749601e143e553b5cb48c217c8941dbb9ef478fac420dd ) + set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") elseif(WIN32) FetchContent_Declare( ICU URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Win64-MSVC2019.zip URL_HASH SHA256=af6b585e49d90d39ae9d3fe298b7f56983931706a5e49d4bce675c6a499124e5 ) + set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src") endif() FetchContent_MakeAvailable(ICU) - set(ICU_ROOT "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") - message(STATUS "Using prebuilt ICU from ${ICU_ROOT}.") + list(PREPEND CMAKE_PREFIX_PATH "${ICU_DIR}") + message(STATUS "Using prebuilt ICU from ${ICU_DIR}.") endif() FetchContent_Declare( From 60ec8e480f4c3286ada324ce0a9d71d388c770f7 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Wed, 15 Jan 2025 17:00:24 +0100 Subject: [PATCH 29/30] build from sources --- cmake/external/icu.cmake | 80 ++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 27 +++----------- 2 files changed, 85 insertions(+), 22 deletions(-) create mode 100644 cmake/external/icu.cmake diff --git a/cmake/external/icu.cmake b/cmake/external/icu.cmake new file mode 100644 index 
00000000..5b3ec1bd --- /dev/null +++ b/cmake/external/icu.cmake @@ -0,0 +1,80 @@ +include(FetchContent) + +set(THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/_deps/icu) +set(ICU_SOURCE_DIR ${THIRD_PARTY_PATH}/icu-src) +set(ICU_BINARY_DIR ${THIRD_PARTY_PATH}/icu-build) +SET(ICU_INSTALL_DIR ${THIRD_PARTY_PATH}/icu-install) + +set(HOST_ENV_CMAKE ${CMAKE_COMMAND} -E env + CC=${CMAKE_C_COMPILER} + CXX=${CMAKE_CXX_COMPILER} + CFLAGS=${CMAKE_C_FLAGS} + CXXFLAGS=${CMAKE_CXX_FLAGS} + LDFLAGS=${CMAKE_MODULE_LINKER_FLAGS} +) + +if(GENERATOR_IS_MULTI_CONFIG_VAR) + set(ICU_CONFIGURE_FLAGS $<$:"--enable-debug">$<$:"--enable-release">) + set(ICU_BUILD_TYPE $) +else() + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(ICU_CONFIGURE_FLAGS "--enable-debug") + else() + set(ICU_CONFIGURE_FLAGS "--enable-release") + endif() + set(ICU_BUILD_TYPE ${CMAKE_BUILD_TYPE}) +endif() + +set(FETCHCONTENT_QUIET FALSE) +# Fetch and build ICU +FetchContent_Declare( + ICU + URL https://github.com/unicode-org/icu/archive/refs/tags/release-70-1.tar.gz + URL_HASH SHA256=f30d670bdc03ba999638a2d2511952ab94adf204d0e14898666f2e0cacb7fef1 + SOURCE_DIR ${ICU_SOURCE_DIR} + BINARY_DIR ${ICU_BINARY_DIR} + DOWNLOAD_EXTRACT_TIMESTAMP TRUE +) + +FetchContent_MakeAvailable(ICU) + +if(NOT ICU_POPULATED) + # Configure the ICU build + message(STATUS "Configuring ICU...") + execute_process( + COMMAND ${ICU_SOURCE_DIR}/icu4c/source/runConfigureICU Linux --prefix ${ICU_INSTALL_DIR} ${ICU_CONFIGURE_FLAGS} + --disable-tests + --disable-samples + --disable-tools + --disable-extras + --disable-icuio + --disable-draft + WORKING_DIRECTORY ${ICU_BINARY_DIR} + ) + message(STATUS "Building ICU...") + execute_process( + COMMAND make -j${CMAKE_JOB_POOL_SIZE} + WORKING_DIRECTORY ${ICU_BINARY_DIR} + ) + message(STATUS "Installing ICU...") + execute_process( + COMMAND make install + WORKING_DIRECTORY ${ICU_BINARY_DIR} + ) +endif() +# Manually set ICU include and library directories +set(ICU_ROOT ${ICU_INSTALL_DIR}) + +if(WIN32) + set(SHARED_LIB_EXT "*.dll") +elseif(APPLE) + set(SHARED_LIB_EXT "*.dylib") +else() + set(SHARED_LIB_EXT "*.so") +endif() + +install( + DIRECTORY ${ICU_INSTALL_DIR}/lib/ + DESTINATION $ + FILES_MATCHING PATTERN "${SHARED_LIB_EXT}" +) \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 173be66d..e4c5d3b9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -95,29 +95,12 @@ endif() include(FetchContent) option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) -set(SUPPORTED_ARCHS "X86;X86_64") -# There are no prebuilt ICU packages for macOS and Linux arm64 -if (ENABLE_SYSTEM_ICU OR APPLE OR NOT OV_HOST_ARCH IN_LIST SUPPORTED_ARCHS) + +if(ENABLE_SYSTEM_ICU) message(STATUS "Using system-installed ICU.") else() - if(UNIX) - FetchContent_Declare( - ICU - URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Ubuntu-20.04-x64.tgz - URL_HASH SHA256=a8134e9f8a68d33600749601e143e553b5cb48c217c8941dbb9ef478fac420dd - ) - set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") - elseif(WIN32) - FetchContent_Declare( - ICU - URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Win64-MSVC2019.zip - URL_HASH SHA256=af6b585e49d90d39ae9d3fe298b7f56983931706a5e49d4bce675c6a499124e5 - ) - set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src") - endif() - FetchContent_MakeAvailable(ICU) - list(PREPEND CMAKE_PREFIX_PATH "${ICU_DIR}") - message(STATUS "Using prebuilt ICU from ${ICU_DIR}.") + message(STATUS "ICU not found, building from source...") + 
include(${CMAKE_SOURCE_DIR}/cmake/external/icu.cmake) endif() FetchContent_Declare( @@ -334,7 +317,7 @@ install(FILES "${openvino_tokenizers_SOURCE_DIR}/LICENSE" "${openvino_tokenizers_SOURCE_DIR}/README.md" DESTINATION "docs/openvino_tokenizers" COMPONENT openvino_tokenizers_docs) - + # # Cpack configuration # From d971aacc66190ac083b8421ad057a3603a453aaa Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Fri, 17 Jan 2025 15:20:49 +0100 Subject: [PATCH 30/30] test commit --- CMakeLists.txt | 34 ++++++++ src/CMakeLists.txt | 202 +++++++++++++++++++++++---------------------- 2 files changed, 137 insertions(+), 99 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc277fd4..a5765b02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,39 @@ else() set(BUILD_TYPE ${CMAKE_BUILD_TYPE}) endif() +# Put binaries at the top level for NPM package +if(CPACK_GENERATOR STREQUAL "NPM") + set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR .) + set(OPENVINO_TOKENIZERS_INSTALL_BINDIR .) +else() + # - Windows: `\runtime\bin\intel64\Release\` + # - MacOS_x86: `/runtime/lib/intel64/Release` + # - MacOS_arm64: `/runtime/lib/arm64/Release/` + # - Linux_x86: `/runtime/lib/intel64/` + # - Linux_arm64: `/runtime/lib/aarch64/` + string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" OPENVINO_TOKENIZERS_INSTALL_DIR) + if(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64) + elseif(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") + if(APPLE) + set(OPENVINO_TOKENIZERS_INSTALL_DIR "arm64") + else() + set(OPENVINO_TOKENIZERS_INSTALL_DIR "aarch64") + endif() + elseif(OPENVINO_TOKENIZERS_INSTALL_DIR STREQUAL "x86_64" OR OPENVINO_TOKENIZERS_INSTALL_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64 + OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") + set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64) + endif() + + if(WIN32 OR APPLE) + set(OPENVINO_TOKENIZERS_INSTALL_DIR ${OPENVINO_TOKENIZERS_INSTALL_DIR}/${BUILD_TYPE}) + endif() + + set(OPENVINO_TOKENIZERS_INSTALL_BINDIR "runtime/bin/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of bin files - Windows dll") + set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR "runtime/lib/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of lib files") +endif() + + project(openvino_tokenizers VERSION 2025.0.0.0 DESCRIPTION "OpenVINO Tokenizers" @@ -78,6 +111,7 @@ if(BUILD_CPP_EXTENSION) endif() add_subdirectory(src) + endif() # install python files diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e4c5d3b9..2f401170 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -96,78 +96,109 @@ include(FetchContent) option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) -if(ENABLE_SYSTEM_ICU) - message(STATUS "Using system-installed ICU.") -else() +# if(ENABLE_SYSTEM_ICU) +# message(STATUS "Using system-installed ICU.") +# else() message(STATUS "ICU not found, building from source...") - include(${CMAKE_SOURCE_DIR}/cmake/external/icu.cmake) -endif() - -FetchContent_Declare( - sentencepiece - URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz - URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 -) -FetchContent_GetProperties(sentencepiece) -if(NOT sentencepiece_POPULATED) -if(DEFINED ENV{CONDA_BUILD_SYSROOT}) - set(openvino_installed_from_conda ON) - # OpenVINO conda package dynamically linked with external protobuf, - # and we need to link 
sentencepiece with external protobuf too. - set(CMAKE_FIND_PACKAGE_PREFER_CONFIG ON) - set(protobuf_MODULE_COMPATIBLE ON CACHE BOOL "Protobuf module compatible") -endif() -if(openvino_installed_from_conda AND NOT WIN32) - set(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "") - set(SPM_PROTOBUF_PROVIDER "package" CACHE STRING "") - set(SPM_ABSL_PROVIDER "package" CACHE STRING "") -else() - set(SPM_USE_BUILTIN_PROTOBUF ON CACHE BOOL "") - set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "") - set(SPM_ABSL_PROVIDER "internal" CACHE STRING "") -endif() - -set(SPM_ENABLE_SHARED OFF CACHE BOOL "") -set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") -set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") -FetchContent_Populate(sentencepiece) -add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) -endif() + # set(CMAKE_FIND_DEBUG_MODE ON) + # include(${CMAKE_SOURCE_DIR}/cmake/external/icu.cmake) + set(THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/_deps/icu) + set(ICU_SOURCE_DIR ${THIRD_PARTY_PATH}/icu-src CACHE PATH "Path to ICU source directory") + set(ICU_BUILD_DIR ${THIRD_PARTY_PATH}/icu-build CACHE PATH "Path to ICU build directory") + set(ICU_INSTALL_DIR ${THIRD_PARTY_PATH}/icu-install CACHE PATH "Path to ICU build directory") + + include(ExternalProject) + set_property(GLOBAL PROPERTY EP_STEP_TARGETS_VERBOSE ON) + # Add ICU as an external project + ExternalProject_Add( + icu_external + # GIT_REPOSITORY "https://github.com/unicode-org/icu.git" + # GIT_TAG "release-70-1" + URL https://github.com/unicode-org/icu/archive/refs/tags/release-70-1.tar.gz + URL_HASH SHA256=f30d670bdc03ba999638a2d2511952ab94adf204d0e14898666f2e0cacb7fef1 + PREFIX ${THIRD_PARTY_PATH} + SOURCE_DIR ${ICU_SOURCE_DIR} + BINARY_DIR ${ICU_BUILD_DIR} + INSTALL_DIR ${ICU_INSTALL_DIR} + CONFIGURE_COMMAND ${ICU_SOURCE_DIR}/icu4c/source/runConfigureICU Linux --prefix ${ICU_INSTALL_DIR} --disable-tests --disable-samples --disable-tools --disable-extras --disable-icuio --disable-draft --disable-icu-config + BUILD_COMMAND make -j${CMAKE_JOB_POOL_SIZE} + INSTALL_COMMAND make install + DOWNLOAD_EXTRACT_TIMESTAMP ON + ) + + list(PREPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/Modules") + find_package(ICU COMPONENTS i18n data uc REQUIRED) +# endif() + + include(${CMAKE_SOURCE_DIR}/cmake/external/sentencepiece.cmake) + +# FetchContent_Declare( +# sentencepiece +# URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz +# URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 +# ) +# FetchContent_GetProperties(sentencepiece) +# if(NOT sentencepiece_POPULATED) +# if(DEFINED ENV{CONDA_BUILD_SYSROOT}) +# set(openvino_installed_from_conda ON) +# # OpenVINO conda package dynamically linked with external protobuf, +# # and we need to link sentencepiece with external protobuf too. 
+# set(CMAKE_FIND_PACKAGE_PREFER_CONFIG ON) +# set(protobuf_MODULE_COMPATIBLE ON CACHE BOOL "Protobuf module compatible") +# endif() +# if(openvino_installed_from_conda AND NOT WIN32) +# set(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "") +# set(SPM_PROTOBUF_PROVIDER "package" CACHE STRING "") +# set(SPM_ABSL_PROVIDER "package" CACHE STRING "") +# else() +# set(SPM_USE_BUILTIN_PROTOBUF ON CACHE BOOL "") +# set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "") +# set(SPM_ABSL_PROVIDER "internal" CACHE STRING "") +# endif() + +# set(SPM_ENABLE_SHARED OFF CACHE BOOL "") +# set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") +# set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") + +# FetchContent_Populate(sentencepiece) +# add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) +# endif() function(ov_tokenizers_link_sentencepiece TARGET_NAME) - if(sentencepiece_FOUND) - foreach(sp_target sentencepiece sentencepiece_train) - if(TARGET ${sp_target}-static) - # on Windows conda-forge builds sentencepiece as static library - target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}-static) - else() - target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}) - endif() - endforeach() - target_link_libraries(${TARGET_NAME} PRIVATE absl::string_view absl::flat_hash_set) - else() - target_include_directories(${TARGET_NAME} SYSTEM PRIVATE - "${sentencepiece_SOURCE_DIR}/src/builtin_pb" - "${sentencepiece_SOURCE_DIR}/src" - "${sentencepiece_SOURCE_DIR}/third_party/protobuf-lite" - "${sentencepiece_SOURCE_DIR}/third_party/" # for libabseil - "${sentencepiece_SOURCE_DIR}" - "${sentencepiece_BINARY_DIR}") - - foreach(sp_target sentencepiece-static sentencepiece_train-static) - if(CMAKE_CL_64) - target_compile_definitions(${sp_target} PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) - endif() - # to propogate _GLIBCXX_USE_CXX11_ABI value - target_compile_definitions(${sp_target} PUBLIC $) - target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}) - endforeach(sp_target sentencepiece sentencepiece_train) - - if(ANDROID) - # see https://github.com/protocolbuffers/protobuf/issues/2719#issuecomment-625400968 - target_link_libraries(${TARGET_NAME} PRIVATE log) + foreach(sp_target sentencepiece sentencepiece_train) + if(TARGET ${sp_target}-static) + # on Windows conda-forge builds sentencepiece as static library + target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}-static) + else() + target_link_libraries(${TARGET_NAME} INTERFACE ${sp_target}) endif() + endforeach() + target_link_libraries(${TARGET_NAME} INTERFACE absl::string_view absl::flat_hash_set) + + target_include_directories(${TARGET_NAME} SYSTEM PRIVATE + "${SPM_SOURCE_DIR}/src/builtin_pb" + "${SPM_SOURCE_DIR}/src" + "${SPM_SOURCE_DIR}/third_party/protobuf-lite" + "${SPM_SOURCE_DIR}/third_party/" # for libabseil + "${SPM_SOURCE_DIR}" + "${SPM_BINARY_DIR}") + + target_link_libraries(${TARGET_NAME} INTERFACE sentencepiece::sentencepiece) + + # foreach(sp_target sentencepiece-static sentencepiece_train-static) + # if(CMAKE_CL_64) + # target_compile_definitions(${sp_target} PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) + # endif() + # # to propogate _GLIBCXX_USE_CXX11_ABI value + # # target_compile_definitions(${sp_target} PUBLIC $) + # target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}) + # endforeach(sp_target sentencepiece sentencepiece_train) + + if(ANDROID) + # see https://github.com/protocolbuffers/protobuf/issues/2719#issuecomment-625400968 + target_link_libraries(${TARGET_NAME} 
PRIVATE log) endif() + endfunction() function(ov_tokenizers_build_static_re2) @@ -241,6 +272,11 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_OPTIONS "${extra_flags}" target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_OPENVINO_EXTENSION_API) target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime openvino::threading) +# Ensure ICU is built before your main project +add_dependencies(${TARGET_NAME} sentencepiece) +# Add ICU include and library directories to the target +target_link_libraries(${TARGET_NAME} INTERFACE ICU::i18n ICU::uc ICU::data) + # # Set install RPATH # @@ -266,38 +302,6 @@ endif() # Installation # -# Put binaries at the top level for NPM package -if(CPACK_GENERATOR STREQUAL "NPM") - set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR .) - set(OPENVINO_TOKENIZERS_INSTALL_BINDIR .) -else() - # - Windows: `\runtime\bin\intel64\Release\` - # - MacOS_x86: `/runtime/lib/intel64/Release` - # - MacOS_arm64: `/runtime/lib/arm64/Release/` - # - Linux_x86: `/runtime/lib/intel64/` - # - Linux_arm64: `/runtime/lib/aarch64/` - string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" OPENVINO_TOKENIZERS_INSTALL_DIR) - if(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "amd64.*|x86_64.*|AMD64.*") - set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64) - elseif(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") - if(APPLE) - set(OPENVINO_TOKENIZERS_INSTALL_DIR "arm64") - else() - set(OPENVINO_TOKENIZERS_INSTALL_DIR "aarch64") - endif() - elseif(OPENVINO_TOKENIZERS_INSTALL_DIR STREQUAL "x86_64" OR OPENVINO_TOKENIZERS_INSTALL_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64 - OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") - set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64) - endif() - - if(WIN32 OR APPLE) - set(OPENVINO_TOKENIZERS_INSTALL_DIR ${OPENVINO_TOKENIZERS_INSTALL_DIR}/${BUILD_TYPE}) - endif() - - set(OPENVINO_TOKENIZERS_INSTALL_BINDIR "runtime/bin/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of bin files - Windows dll") - set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR "runtime/lib/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of lib files") -endif() - # Installing the extension module install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_LIBDIR} COMPONENT openvino_tokenizers