From 1dd6b667a83a6c7e7c88f02cdbd595d1f9ac028c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 26 Nov 2024 15:01:54 +0000 Subject: [PATCH 01/30] Add unicode normalization layer tests --- tests/layer_tests.py | 74 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 17ed84f4..5ea8640e 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -2,10 +2,11 @@ import re import tempfile from pathlib import Path -from typing import Union +from typing import Union, NamedTuple import openvino as ov import pytest +import requests from openvino import Model, PartialShape, Type from openvino.runtime import op from openvino_tokenizers import _get_factory @@ -19,12 +20,52 @@ RegexSplitStep, TokenizerPipeline, UTF8ValidateStep, + NormalizeUnicode, ) from tests.utils import get_hf_tokenizer core = ov.Core() +UNICODE_TEST_FILE_URL = "https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt" + + +class NormalizationTestLine(NamedTuple): + source: str + nfc: str + nfd: str + nfkc: str + nfkd: str + comment: str + +def parse_normalization_test_line(line): + parts, comment = line.split("#", 1) + parts = [part.strip() for part in parts.split(";")] + + # Convert the hexadecimal Unicode code points to characters + def hex_to_char(hex_str): + return "".join(chr(int(code, 16)) for code in hex_str.split()) + + # Parse the components + source = hex_to_char(parts[0]) + nfc = hex_to_char(parts[1]) + nfd = hex_to_char(parts[2]) + nfkc = hex_to_char(parts[3]) + nfkd = hex_to_char(parts[4]) + + return NormalizationTestLine(source, nfc, nfd, nfkc, nfkd, comment) + + +@pytest.fixture(scope="session") +def unicode_normalization_test_data(request): + # check https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt for details + test_file = requests.get(UNICODE_TEST_FILE_URL).text + return [ + parse_normalization_test_line(line) + for line in test_file.split("\n") + if line and not line.startswith("#") and not line.startswith("@") + ] + ############################################ ########## Test Normalizer Step ############ @@ -115,6 +156,31 @@ def precompiled_charsmap_json(request, hf_charsmap_tokenizer): return tj["normalizer"]["normalizers"][0] +@pytest.mark.parametrize( + "test_parameters", + [ + ("NFC", 19875, 90), + ("NFD", 19851, 114), + ("NFKC", 19777, 188), + ("NFKD", 19753, 212), + ] +) +def test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): + normalization_type, positive_threshold, negative_threshold = test_parameters + nfc_normalizer_layer = NormalizeUnicode(normalization_type) + compiled_model = create_normalization_model(nfc_normalizer_layer) + negative = 0 + positive = 0 + for test_input in unicode_normalization_test_data: + res_ov = compiled_model([test_input.source])[0][0].encode() + expected = getattr(test_input, normalization_type.lower()).encode() + positive += res_ov == expected + negative += res_ov != expected + + assert positive == positive_threshold + assert negative == negative_threshold + + @pytest.mark.parametrize("test_string", charsmap_test_strings) def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) @@ -140,7 +206,7 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled RegexNormalizationStep( regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)", 
replace_term=r"\1", - ) + ), ), ("", "", RegexNormalizationStep.prepend_regex("▁")), ("\n", "▁\n", RegexNormalizationStep.prepend_regex("▁")), @@ -152,9 +218,9 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled RegexNormalizationStep( regex_search_pattern=r"(^)(.)", replace_term=r"▁\2", - ) + ), ), - ] + ], ) def test_regex_normalization(test_string, expected, layer): compiled_model = create_normalization_model(layer) From afd4a60e8dd4858c72913d684e2525d978a27f38 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 29 Nov 2024 10:47:56 +0000 Subject: [PATCH 02/30] WiP --- .../openvino_tokenizers/tokenizer_pipeline.py | 19 ++++++++- src/CMakeLists.txt | 4 +- src/charsmap_normalization.cpp | 42 +++++++++++++------ src/charsmap_normalization.hpp | 27 +++++++++++- tests/layer_tests.py | 19 ++++----- 5 files changed, 84 insertions(+), 27 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 734a04d5..76102d1a 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -155,7 +155,7 @@ class NormalizationStep(BasePipelineStep): @dataclass -class NormalizeUnicode(NormalizationStep): +class _NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @@ -168,6 +168,23 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: ) .outputs() ) + pass + + +@dataclass +class NormalizeUnicode(NormalizationStep): + normalization_form: str = "NFD" + + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + {"normalization_form": self.normalization_form.lower()}, + ) + .outputs() + ) @dataclass diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2caea5f4..11b11584 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,8 +150,8 @@ if(sentencepiece_FOUND) else() FetchContent_Declare( sentencepiece - URL https://github.com/google/sentencepiece/archive/refs/tags/v0.2.0.tar.gz - URL_HASH SHA256=9970f0a0afee1648890293321665e5b2efa04eaec9f1671fcf8048f456f5bb86 + URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz + URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 ) FetchContent_GetProperties(sentencepiece) if(NOT sentencepiece_POPULATED) diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index d5ff9739..64cdc34b 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -11,8 +11,8 @@ using namespace ov; namespace { -std::shared_ptr make_identity_spec() { - auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec("identity"); +std::shared_ptr make_normalization_spec(const std::string& normalization_form) { + auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec(normalization_form); return std::make_shared(spec); } @@ -21,35 +21,53 @@ std::shared_ptr make_identity_spec() { void CharsMapNormalization::validate_and_infer_types() { auto input_size = get_input_size(); - OPENVINO_ASSERT(input_size == 4 || input_size == 5, "supported input sizes are 4 or 5"); - - const bool has_skips = (input_size == 5); + bool has_skips; + if (m_normalization_form == "") { + OPENVINO_ASSERT(input_size == 4 || input_size == 5, "supported input sizes are 4 or 5 with input spec"); + has_skips = (input_size == 5); 
+ OPENVINO_ASSERT(get_input_element_type(3 + has_skips) == element::u8, "Charsmap normalizer accepts precompiled mapping and it should be of type u8 tensor"); + } else { + OPENVINO_ASSERT(input_size == 3 || input_size == 4, "supported input sizes are 3 or 4 without input spec"); + has_skips = (input_size == 4); + } check_string_input(this, 0); - OPENVINO_ASSERT(get_input_element_type(3 + has_skips) == element::u8, "Charsmap normalizer accepts precompiled mapping and it should be of type u8 tensor"); set_string_output(this, 0, get_input_partial_shape(0)); if (has_skips) { this->set_output_type(3, get_input_element_type(3), get_input_partial_shape(3)); }; + std::cerr << "CharsMapNormalization validation done" << std::endl; } bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - const bool has_skips = (inputs.size() == 5); - { - // Write to common trie structures should be protected to prevent race conditions. + std::cerr << "CharsMapNormalization evaluate" << std::endl; + const bool has_skips = (inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); + std::cerr << "has_skips: " << has_skips << std::endl; + { std::lock_guard lock(m_mutex); if (m_normalizer == nullptr) { - const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - m_spec = make_identity_spec(); + std::cerr << "CharsMapNormalization creating normalizer" << std::endl; + auto normalization_form = m_normalization_form == "" ? "identity" : m_normalization_form; + + std::cerr << "normalization_form: " << normalization_form << std::endl; + + m_spec = make_normalization_spec(normalization_form); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); m_spec->set_escape_whitespaces(m_escape_whitespaces); - m_spec->set_precompiled_charsmap(precompiled_charsmap); + + if (m_normalization_form == "") { + std::cerr << "CharsMapNormalization setting precompiled_charsmap" << std::endl; + const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + m_spec->set_precompiled_charsmap(precompiled_charsmap); + }; + m_normalizer = std::make_shared(*m_spec); } } + std::cerr << "CharsMapNormalization evaluating normalization" << std::endl; return evaluate_normalization_helper( outputs, inputs, diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index ef99d9c1..8ef869d0 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -27,18 +27,40 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec) { + std::cerr << "CharsMapNormalization constructor" << std::endl; + constructor_validate_and_infer_types(); + } + CharsMapNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr normalizer, + const std::shared_ptr spec, + bool add_dummy_prefix = false, + bool escape_whitespaces = false, + const std::string& normalization_form = "" + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_normalization_form(normalization_form) { + std::cerr << "CharsMapNormalization constructor2" << std::endl; + constructor_validate_and_infer_types(); + } + CharsMapNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr normalizer, + const std::shared_ptr spec, + const std::string& 
normalization_form = "" + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_normalization_form(normalization_form) { + std::cerr << "CharsMapNormalization constructor3" << std::endl; constructor_validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_normalization_form); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); + visitor.on_attribute("normalization_form", m_normalization_form); return true; } @@ -52,8 +74,9 @@ class CharsMapNormalization : public ov::op::Op { bool m_add_dummy_prefix = false; bool m_escape_whitespaces = false; + std::string m_normalization_form = ""; // spec should be preserved for the lifetime of the normalizer mutable std::shared_ptr m_spec; mutable std::mutex m_mutex; -}; \ No newline at end of file +}; diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 5ea8640e..4765c849 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -74,7 +74,6 @@ def unicode_normalization_test_data(request): utf8_validate_strings = [ # Valid sequences. b"Eng... test, string?!", - b"Eng... test, string?!", b"\xe2\x82\xac", # Euro sign €ß "Проверка, как работает кириллица Љ љ Ђ ђ".encode(), "測試字符串".encode(), @@ -156,6 +155,15 @@ def precompiled_charsmap_json(request, hf_charsmap_tokenizer): return tj["normalizer"]["normalizers"][0] +@pytest.mark.parametrize("test_string", charsmap_test_strings) +def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): + charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) + compiled_model = create_normalization_model(charsmap_normalization_node) + res_ov = compiled_model([test_string])[0][0] + res_hf = hf_charsmap_tokenizer.backend_tokenizer.normalizer.normalize_str(test_string) + assert res_ov == res_hf + + @pytest.mark.parametrize( "test_parameters", [ @@ -181,15 +189,6 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test assert negative == negative_threshold -@pytest.mark.parametrize("test_string", charsmap_test_strings) -def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): - charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) - compiled_model = create_normalization_model(charsmap_normalization_node) - res_ov = compiled_model([test_string])[0][0] - res_hf = hf_charsmap_tokenizer.backend_tokenizer.normalizer.normalize_str(test_string) - assert res_ov == res_hf - - @pytest.mark.parametrize( "test_string, expected, layer", [ From 2f24fec72c1a7b9741bd7a9f6fa718962971c4d8 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 20 Dec 2024 13:21:30 +0000 Subject: [PATCH 03/30] WiP --- src/CMakeLists.txt | 1 + src/charsmap_normalization.cpp | 50 ++++++++++++++++++---------------- src/charsmap_normalization.hpp | 15 ++++++---- tests/layer_tests.py | 49 +++++++++++++++++++++++---------- 4 files changed, 71 insertions(+), 44 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11b11584..97aae478 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -174,6 +174,7 @@ else() set(SPM_ENABLE_SHARED OFF CACHE BOOL "") 
set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") + set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") FetchContent_Populate(sentencepiece) add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) endif() diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 64cdc34b..1a6256e8 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -4,20 +4,11 @@ #include "charsmap_normalization.hpp" #include "utils.hpp" -#include "sentencepiece_trainer.h" // for making normalizer spec +#include "builder.h" // for making normalizer spec #include "absl/strings/str_format.h" using namespace ov; -namespace { - -std::shared_ptr make_normalization_spec(const std::string& normalization_form) { - auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec(normalization_form); - return std::make_shared(spec); -} - -} // namespace - void CharsMapNormalization::validate_and_infer_types() { auto input_size = get_input_size(); @@ -37,37 +28,50 @@ void CharsMapNormalization::validate_and_infer_types() { if (has_skips) { this->set_output_type(3, get_input_element_type(3), get_input_partial_shape(3)); }; - std::cerr << "CharsMapNormalization validation done" << std::endl; } bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - std::cerr << "CharsMapNormalization evaluate" << std::endl; const bool has_skips = (inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); - std::cerr << "has_skips: " << has_skips << std::endl; { std::lock_guard lock(m_mutex); if (m_normalizer == nullptr) { - std::cerr << "CharsMapNormalization creating normalizer" << std::endl; - auto normalization_form = m_normalization_form == "" ? 
"identity" : m_normalization_form; - - std::cerr << "normalization_form: " << normalization_form << std::endl; - - m_spec = make_normalization_spec(normalization_form); + m_spec = std::make_shared(); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); m_spec->set_escape_whitespaces(m_escape_whitespaces); + std::string precompiled_charsmap; if (m_normalization_form == "") { - std::cerr << "CharsMapNormalization setting precompiled_charsmap" << std::endl; - const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - m_spec->set_precompiled_charsmap(precompiled_charsmap); + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + } else if (m_normalization_form == "nfc") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfd") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfkc") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfkd") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else { + OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); }; + std::cerr << "CharsMapNormalization: precompiled_charsmap.size() = " << precompiled_charsmap.size() << std::endl; + std::cerr << "CharsMapNormalization: precompiled_charsmap first 100 chars = " << precompiled_charsmap.substr(0, 100) << std::endl; + + m_spec->set_precompiled_charsmap(precompiled_charsmap); + m_normalizer = std::make_shared(*m_spec); } } - std::cerr << "CharsMapNormalization evaluating normalization" << std::endl; return evaluate_normalization_helper( outputs, inputs, diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index 8ef869d0..43ab09d3 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -27,7 +27,6 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec) { - std::cerr << "CharsMapNormalization constructor" << std::endl; constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -36,9 +35,10 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr spec, bool add_dummy_prefix = false, bool escape_whitespaces = false, - const std::string& normalization_form = "" - ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_normalization_form(normalization_form) { - std::cerr << "CharsMapNormalization constructor2" << std::endl; + bool case_fold = false, + const std::string& normalization_form = "", + bool nmt = false + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), 
m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -47,20 +47,21 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr spec, const std::string& normalization_form = "" ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_normalization_form(normalization_form) { - std::cerr << "CharsMapNormalization constructor3" << std::endl; constructor_validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_normalization_form); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); visitor.on_attribute("normalization_form", m_normalization_form); + visitor.on_attribute("case_fold", m_case_fold); + visitor.on_attribute("nmt", m_nmt); return true; } @@ -74,6 +75,8 @@ class CharsMapNormalization : public ov::op::Op { bool m_add_dummy_prefix = false; bool m_escape_whitespaces = false; + bool m_case_fold = false; + bool m_nmt = false; std::string m_normalization_form = ""; // spec should be preserved for the lifetime of the normalizer diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 4765c849..9a271bc6 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -57,12 +57,16 @@ def hex_to_char(hex_str): @pytest.fixture(scope="session") -def unicode_normalization_test_data(request): +def icu_test_data(request): + return requests.get(UNICODE_TEST_FILE_URL).text + + +@pytest.fixture(scope="session") +def unicode_normalization_test_data(request, icu_test_data): # check https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt for details - test_file = requests.get(UNICODE_TEST_FILE_URL).text return [ parse_normalization_test_line(line) - for line in test_file.split("\n") + for line in icu_test_data.split("\n") if line and not line.startswith("#") and not line.startswith("@") ] @@ -167,26 +171,41 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled @pytest.mark.parametrize( "test_parameters", [ - ("NFC", 19875, 90), - ("NFD", 19851, 114), - ("NFKC", 19777, 188), - ("NFKD", 19753, 212), + # results for sentencepiece charsmap: + ("NFC", 17325), # failed examples: 2640 + ("NFD", 17736), # failed examples: 2229 + ("NFKC", 17159), # failed examples: 2806 + ("NFKD", 17554), # failed examples: 2411 + # results for icu70: + # ("NFC", 19875), # failed examples: 90 + # ("NFD", 19851), # failed examples: 114 + # ("NFKC", 19777), # failed examples: 188 + # ("NFKD", 19753), # failed examples: 212 + # results for huggingface tokenizers: + # ("NFC", 19247), # failed examples: 718 + # ("NFD", 19220), # failed examples: 745 + # ("NFKC", 19077), # failed examples: 888 + # ("NFKD", 19050), # failed examples: 915 ] ) def test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): - normalization_type, positive_threshold, negative_threshold = test_parameters - nfc_normalizer_layer = NormalizeUnicode(normalization_type) - compiled_model = create_normalization_model(nfc_normalizer_layer) - negative = 0 - positive 
= 0 + normalization_type, positive_threshold = test_parameters + normalizer_layer = NormalizeUnicode(normalization_type) + compiled_model = create_normalization_model(normalizer_layer) + positive, negative, no_transformation = 0, 0, 0 for test_input in unicode_normalization_test_data: res_ov = compiled_model([test_input.source])[0][0].encode() expected = getattr(test_input, normalization_type.lower()).encode() positive += res_ov == expected negative += res_ov != expected - - assert positive == positive_threshold - assert negative == negative_threshold + no_transformation += test_input.source.encode() == expected + + assert positive == positive_threshold, ( + f"{normalization_type}\n" + f"Positive: {positive}, expected: {positive_threshold}\n" + f"Negative: {negative}, expected: {len(unicode_normalization_test_data) - positive_threshold}\n" + f"No transformation: {no_transformation}, positive delta: {positive - no_transformation}" + ) @pytest.mark.parametrize( From 08052c2b20b32d20dcd6865713f77f4549d5236d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 8 Jan 2025 20:03:15 +0000 Subject: [PATCH 04/30] Switch Casefold and UnicodeNormalization to CharsMap --- .../openvino_tokenizers/tokenizer_pipeline.py | 66 +++++++++++++------ src/charsmap_normalization.cpp | 26 ++++---- src/charsmap_normalization.hpp | 7 +- tests/layer_tests.py | 19 ++++++ 4 files changed, 83 insertions(+), 35 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 76102d1a..c09ce09c 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -155,25 +155,15 @@ class NormalizationStep(BasePipelineStep): @dataclass -class _NormalizeUnicode(NormalizationStep): +class NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" - def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return ( - _get_factory() - .create( - "NormalizeUnicode", - input_nodes, - {"normalization_form": self.normalization_form}, + def __post_init__(self): + if self.normalization_form not in ["NFD", "NFC", "NFKD", "NFKC"]: + raise ValueError( + 'NormalizeUnicode`normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], ' + f'got {self.normalization_form}.' 
) - .outputs() - ) - pass - - -@dataclass -class NormalizeUnicode(NormalizationStep): - normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return ( @@ -181,7 +171,10 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: .create( "CharsMapNormalization", input_nodes, - {"normalization_form": self.normalization_form.lower()}, + { + "normalization_form": self.normalization_form.lower(), + "remove_extra_whitespaces": False, + }, ) .outputs() ) @@ -199,7 +192,19 @@ def __post_init__(self): ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return _get_factory().create("CaseFold", input_nodes, {"encoding": self.encoding}).outputs() + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": "identity", + "case_fold": True, + "remove_extra_whitespaces": False, + }, + ) + .outputs() + ) @dataclass @@ -262,7 +267,17 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @dataclass class CharsmapStep(NormalizationStep): - charsmap: bytes + charsmap: Optional[bytes] = None + normalization_form: Optional[str] = None + add_dummy_prefix: bool = False + remove_extra_whitespaces: bool = True + escape_whitespaces: bool = False + case_fold: bool = False + nmt: bool = False + + def __post_init__(self): + if self.charsmap is None and self.normalization_form is None: + raise ValueError("[ CharsmapStep ] `charsmap` or `normalization_form` attribute must be set") @classmethod def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": @@ -270,7 +285,18 @@ def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes += make_constant_node(np.frombuffer(self.charsmap, dtype=np.uint8), dtype=Type.u8).outputs() - return _get_factory().create("CharsMapNormalization", input_nodes).outputs() + return _get_factory().create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": self.normalization_form or "", + "add_dummy_prefix": self.add_dummy_prefix, + "remove_extra_whitespaces": self.remove_extra_whitespaces, + "escape_whitespaces": self.escape_whitespaces, + "case_fold": self.case_fold, + "nmt": self.nmt, + } + ).outputs() @dataclass diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 1a6256e8..5178801a 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -38,34 +38,34 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor if (m_normalizer == nullptr) { m_spec = std::make_shared(); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); + m_spec->set_remove_extra_whitespaces(m_remove_extra_whitespaces); m_spec->set_escape_whitespaces(m_escape_whitespaces); - std::string precompiled_charsmap; - if (m_normalization_form == "") { - precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + sentencepiece::normalizer::Builder::CharsMap chars_map; + if (m_normalization_form == "identity") { + // no need to modify chars_map } else if (m_normalization_form == "nfc") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfd") { - sentencepiece::normalizer::Builder::CharsMap chars_map; 
sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfkc") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfkd") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else { OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); }; - std::cerr << "CharsMapNormalization: precompiled_charsmap.size() = " << precompiled_charsmap.size() << std::endl; - std::cerr << "CharsMapNormalization: precompiled_charsmap first 100 chars = " << precompiled_charsmap.substr(0, 100) << std::endl; + if (m_case_fold) { + sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); + }; + std::string precompiled_charsmap; + if (m_normalization_form == "") { + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + } else { + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } m_spec->set_precompiled_charsmap(precompiled_charsmap); m_normalizer = std::make_shared(*m_spec); diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index 43ab09d3..a6179b63 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -34,11 +34,12 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec, bool add_dummy_prefix = false, + bool remove_extra_whitespaces = false, bool escape_whitespaces = false, bool case_fold = false, const std::string& normalization_form = "", bool nmt = false - ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_remove_extra_whitespaces(remove_extra_whitespaces), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -53,11 +54,12 @@ class CharsMapNormalization : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_remove_extra_whitespaces, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); + visitor.on_attribute("remove_extra_whitespaces", m_remove_extra_whitespaces); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); visitor.on_attribute("normalization_form", m_normalization_form); visitor.on_attribute("case_fold", m_case_fold); @@ -74,6 +76,7 @@ class CharsMapNormalization : public ov::op::Op { mutable std::shared_ptr 
m_normalizer; bool m_add_dummy_prefix = false; + bool m_remove_extra_whitespaces = true; bool m_escape_whitespaces = false; bool m_case_fold = false; bool m_nmt = false; diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 9a271bc6..7637b585 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -12,6 +12,7 @@ from openvino_tokenizers import _get_factory from openvino_tokenizers.constants import UTF8ReplaceMode from openvino_tokenizers.tokenizer_pipeline import ( + CaseFoldStep, CharsmapStep, DecodingStep, NormalizationStep, @@ -208,6 +209,24 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test ) + +@pytest.mark.parametrize( + "test_string, expected", + [ + ("a", "a"), + ("A", "a"), + ("Ю", "ю"), + ("Σ", "σ"), + ("Hello World!", "hello world!"), + ] +) +def test_casefold_normalization(test_string, expected): + casefold = CaseFoldStep() + compiled_model = create_normalization_model(casefold) + res_ov = compiled_model([test_string])[0] + assert res_ov == expected + + @pytest.mark.parametrize( "test_string, expected, layer", [ From f6c001b1c2d524394bf74c1233fd9b37cd046da7 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 26 Nov 2024 15:01:54 +0000 Subject: [PATCH 05/30] Add unicode normalization layer tests --- tests/layer_tests.py | 72 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/tests/layer_tests.py b/tests/layer_tests.py index f46b1be9..ddc5be85 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -2,10 +2,11 @@ import re import tempfile from pathlib import Path -from typing import Union +from typing import Union, NamedTuple import openvino as ov import pytest +import requests from openvino import Model, PartialShape, Type from openvino.runtime import op from openvino_tokenizers import _get_factory @@ -19,12 +20,52 @@ RegexSplitStep, TokenizerPipeline, UTF8ValidateStep, + NormalizeUnicode, ) from tests.utils import get_hf_tokenizer core = ov.Core() +UNICODE_TEST_FILE_URL = "https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt" + + +class NormalizationTestLine(NamedTuple): + source: str + nfc: str + nfd: str + nfkc: str + nfkd: str + comment: str + +def parse_normalization_test_line(line): + parts, comment = line.split("#", 1) + parts = [part.strip() for part in parts.split(";")] + + # Convert the hexadecimal Unicode code points to characters + def hex_to_char(hex_str): + return "".join(chr(int(code, 16)) for code in hex_str.split()) + + # Parse the components + source = hex_to_char(parts[0]) + nfc = hex_to_char(parts[1]) + nfd = hex_to_char(parts[2]) + nfkc = hex_to_char(parts[3]) + nfkd = hex_to_char(parts[4]) + + return NormalizationTestLine(source, nfc, nfd, nfkc, nfkd, comment) + + +@pytest.fixture(scope="session") +def unicode_normalization_test_data(request): + # check https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt for details + test_file = requests.get(UNICODE_TEST_FILE_URL).text + return [ + parse_normalization_test_line(line) + for line in test_file.split("\n") + if line and not line.startswith("#") and not line.startswith("@") + ] + ############################################ ########## Test Normalizer Step ############ @@ -115,6 +156,31 @@ def precompiled_charsmap_json(request, hf_charsmap_tokenizer): return tj["normalizer"]["normalizers"][0] +@pytest.mark.parametrize( + "test_parameters", + [ + ("NFC", 19875, 90), + ("NFD", 19851, 114), + ("NFKC", 19777, 188), + ("NFKD", 19753, 212), + ] +) +def 
test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): + normalization_type, positive_threshold, negative_threshold = test_parameters + nfc_normalizer_layer = NormalizeUnicode(normalization_type) + compiled_model = create_normalization_model(nfc_normalizer_layer) + negative = 0 + positive = 0 + for test_input in unicode_normalization_test_data: + res_ov = compiled_model([test_input.source])[0][0].encode() + expected = getattr(test_input, normalization_type.lower()).encode() + positive += res_ov == expected + negative += res_ov != expected + + assert positive == positive_threshold + assert negative == negative_threshold + + @pytest.mark.parametrize("test_string", charsmap_test_strings) def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) @@ -140,7 +206,7 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled RegexNormalizationStep( regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)", replace_term=r"\1", - ) + ), ), ("", "", RegexNormalizationStep.prepend_regex("▁")), ("\n", "▁\n", RegexNormalizationStep.prepend_regex("▁")), @@ -152,7 +218,7 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled RegexNormalizationStep( regex_search_pattern=r"(^)(.)", replace_term=r"▁\2", - ) + ), ), ( # test backward compatibility with old regex "\n", From 472b163a62b1003537fe007768490399aa4b364e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 29 Nov 2024 10:47:56 +0000 Subject: [PATCH 06/30] WiP --- .../openvino_tokenizers/tokenizer_pipeline.py | 19 ++++++++- src/CMakeLists.txt | 4 +- src/charsmap_normalization.cpp | 42 +++++++++++++------ src/charsmap_normalization.hpp | 27 +++++++++++- tests/layer_tests.py | 19 ++++----- 5 files changed, 84 insertions(+), 27 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index fa99f6b8..3e751491 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -155,7 +155,7 @@ class NormalizationStep(BasePipelineStep): @dataclass -class NormalizeUnicode(NormalizationStep): +class _NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @@ -168,6 +168,23 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: ) .outputs() ) + pass + + +@dataclass +class NormalizeUnicode(NormalizationStep): + normalization_form: str = "NFD" + + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + {"normalization_form": self.normalization_form.lower()}, + ) + .outputs() + ) @dataclass diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2caea5f4..11b11584 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,8 +150,8 @@ if(sentencepiece_FOUND) else() FetchContent_Declare( sentencepiece - URL https://github.com/google/sentencepiece/archive/refs/tags/v0.2.0.tar.gz - URL_HASH SHA256=9970f0a0afee1648890293321665e5b2efa04eaec9f1671fcf8048f456f5bb86 + URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz + URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 ) FetchContent_GetProperties(sentencepiece) if(NOT sentencepiece_POPULATED) diff 
--git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index d5ff9739..64cdc34b 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -11,8 +11,8 @@ using namespace ov; namespace { -std::shared_ptr make_identity_spec() { - auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec("identity"); +std::shared_ptr make_normalization_spec(const std::string& normalization_form) { + auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec(normalization_form); return std::make_shared(spec); } @@ -21,35 +21,53 @@ std::shared_ptr make_identity_spec() { void CharsMapNormalization::validate_and_infer_types() { auto input_size = get_input_size(); - OPENVINO_ASSERT(input_size == 4 || input_size == 5, "supported input sizes are 4 or 5"); - - const bool has_skips = (input_size == 5); + bool has_skips; + if (m_normalization_form == "") { + OPENVINO_ASSERT(input_size == 4 || input_size == 5, "supported input sizes are 4 or 5 with input spec"); + has_skips = (input_size == 5); + OPENVINO_ASSERT(get_input_element_type(3 + has_skips) == element::u8, "Charsmap normalizer accepts precompiled mapping and it should be of type u8 tensor"); + } else { + OPENVINO_ASSERT(input_size == 3 || input_size == 4, "supported input sizes are 3 or 4 without input spec"); + has_skips = (input_size == 4); + } check_string_input(this, 0); - OPENVINO_ASSERT(get_input_element_type(3 + has_skips) == element::u8, "Charsmap normalizer accepts precompiled mapping and it should be of type u8 tensor"); set_string_output(this, 0, get_input_partial_shape(0)); if (has_skips) { this->set_output_type(3, get_input_element_type(3), get_input_partial_shape(3)); }; + std::cerr << "CharsMapNormalization validation done" << std::endl; } bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - const bool has_skips = (inputs.size() == 5); - { - // Write to common trie structures should be protected to prevent race conditions. + std::cerr << "CharsMapNormalization evaluate" << std::endl; + const bool has_skips = (inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); + std::cerr << "has_skips: " << has_skips << std::endl; + { std::lock_guard lock(m_mutex); if (m_normalizer == nullptr) { - const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - m_spec = make_identity_spec(); + std::cerr << "CharsMapNormalization creating normalizer" << std::endl; + auto normalization_form = m_normalization_form == "" ? 
"identity" : m_normalization_form; + + std::cerr << "normalization_form: " << normalization_form << std::endl; + + m_spec = make_normalization_spec(normalization_form); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); m_spec->set_escape_whitespaces(m_escape_whitespaces); - m_spec->set_precompiled_charsmap(precompiled_charsmap); + + if (m_normalization_form == "") { + std::cerr << "CharsMapNormalization setting precompiled_charsmap" << std::endl; + const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + m_spec->set_precompiled_charsmap(precompiled_charsmap); + }; + m_normalizer = std::make_shared(*m_spec); } } + std::cerr << "CharsMapNormalization evaluating normalization" << std::endl; return evaluate_normalization_helper( outputs, inputs, diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index ef99d9c1..8ef869d0 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -27,18 +27,40 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec) { + std::cerr << "CharsMapNormalization constructor" << std::endl; + constructor_validate_and_infer_types(); + } + CharsMapNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr normalizer, + const std::shared_ptr spec, + bool add_dummy_prefix = false, + bool escape_whitespaces = false, + const std::string& normalization_form = "" + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_normalization_form(normalization_form) { + std::cerr << "CharsMapNormalization constructor2" << std::endl; + constructor_validate_and_infer_types(); + } + CharsMapNormalization( + const ov::OutputVector& arguments, + const std::shared_ptr normalizer, + const std::shared_ptr spec, + const std::string& normalization_form = "" + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_normalization_form(normalization_form) { + std::cerr << "CharsMapNormalization constructor3" << std::endl; constructor_validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_normalization_form); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); + visitor.on_attribute("normalization_form", m_normalization_form); return true; } @@ -52,8 +74,9 @@ class CharsMapNormalization : public ov::op::Op { bool m_add_dummy_prefix = false; bool m_escape_whitespaces = false; + std::string m_normalization_form = ""; // spec should be preserved for the lifetime of the normalizer mutable std::shared_ptr m_spec; mutable std::mutex m_mutex; -}; \ No newline at end of file +}; diff --git a/tests/layer_tests.py b/tests/layer_tests.py index ddc5be85..f5d8fca9 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -74,7 +74,6 @@ def unicode_normalization_test_data(request): utf8_validate_strings = [ # Valid sequences. b"Eng... test, string?!", - b"Eng... 
test, string?!", b"\xe2\x82\xac", # Euro sign €ß "Проверка, как работает кириллица Љ љ Ђ ђ".encode(), "測試字符串".encode(), @@ -156,6 +155,15 @@ def precompiled_charsmap_json(request, hf_charsmap_tokenizer): return tj["normalizer"]["normalizers"][0] +@pytest.mark.parametrize("test_string", charsmap_test_strings) +def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): + charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) + compiled_model = create_normalization_model(charsmap_normalization_node) + res_ov = compiled_model([test_string])[0][0] + res_hf = hf_charsmap_tokenizer.backend_tokenizer.normalizer.normalize_str(test_string) + assert res_ov == res_hf + + @pytest.mark.parametrize( "test_parameters", [ @@ -181,15 +189,6 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test assert negative == negative_threshold -@pytest.mark.parametrize("test_string", charsmap_test_strings) -def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled_charsmap_json): - charsmap_normalization_node = CharsmapStep.from_hf_step_json(precompiled_charsmap_json) - compiled_model = create_normalization_model(charsmap_normalization_node) - res_ov = compiled_model([test_string])[0][0] - res_hf = hf_charsmap_tokenizer.backend_tokenizer.normalizer.normalize_str(test_string) - assert res_ov == res_hf - - @pytest.mark.parametrize( "test_string, expected, layer", [ From 04fb20c5329ddc07cab6d44ee491e3d4dd4fb7c6 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 20 Dec 2024 13:21:30 +0000 Subject: [PATCH 07/30] WiP --- src/CMakeLists.txt | 1 + src/charsmap_normalization.cpp | 50 ++++++++++++++++++---------------- src/charsmap_normalization.hpp | 15 ++++++---- tests/layer_tests.py | 49 +++++++++++++++++++++++---------- 4 files changed, 71 insertions(+), 44 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11b11584..97aae478 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -174,6 +174,7 @@ else() set(SPM_ENABLE_SHARED OFF CACHE BOOL "") set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") + set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") FetchContent_Populate(sentencepiece) add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) endif() diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 64cdc34b..1a6256e8 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -4,20 +4,11 @@ #include "charsmap_normalization.hpp" #include "utils.hpp" -#include "sentencepiece_trainer.h" // for making normalizer spec +#include "builder.h" // for making normalizer spec #include "absl/strings/str_format.h" using namespace ov; -namespace { - -std::shared_ptr make_normalization_spec(const std::string& normalization_form) { - auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec(normalization_form); - return std::make_shared(spec); -} - -} // namespace - void CharsMapNormalization::validate_and_infer_types() { auto input_size = get_input_size(); @@ -37,37 +28,50 @@ void CharsMapNormalization::validate_and_infer_types() { if (has_skips) { this->set_output_type(3, get_input_element_type(3), get_input_partial_shape(3)); }; - std::cerr << "CharsMapNormalization validation done" << std::endl; } bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - std::cerr << "CharsMapNormalization evaluate" << std::endl; const bool has_skips = 
(inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); - std::cerr << "has_skips: " << has_skips << std::endl; { std::lock_guard lock(m_mutex); if (m_normalizer == nullptr) { - std::cerr << "CharsMapNormalization creating normalizer" << std::endl; - auto normalization_form = m_normalization_form == "" ? "identity" : m_normalization_form; - - std::cerr << "normalization_form: " << normalization_form << std::endl; - - m_spec = make_normalization_spec(normalization_form); + m_spec = std::make_shared(); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); m_spec->set_escape_whitespaces(m_escape_whitespaces); + std::string precompiled_charsmap; if (m_normalization_form == "") { - std::cerr << "CharsMapNormalization setting precompiled_charsmap" << std::endl; - const std::string precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - m_spec->set_precompiled_charsmap(precompiled_charsmap); + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + } else if (m_normalization_form == "nfc") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfd") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfkc") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else if (m_normalization_form == "nfkd") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } else { + OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); }; + std::cerr << "CharsMapNormalization: precompiled_charsmap.size() = " << precompiled_charsmap.size() << std::endl; + std::cerr << "CharsMapNormalization: precompiled_charsmap first 100 chars = " << precompiled_charsmap.substr(0, 100) << std::endl; + + m_spec->set_precompiled_charsmap(precompiled_charsmap); + m_normalizer = std::make_shared(*m_spec); } } - std::cerr << "CharsMapNormalization evaluating normalization" << std::endl; return evaluate_normalization_helper( outputs, inputs, diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index 8ef869d0..43ab09d3 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -27,7 +27,6 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec) { - std::cerr << "CharsMapNormalization constructor" << std::endl; constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -36,9 +35,10 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr spec, bool add_dummy_prefix = false, bool escape_whitespaces = false, - const std::string& normalization_form = "" - ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), 
m_normalization_form(normalization_form) { - std::cerr << "CharsMapNormalization constructor2" << std::endl; + bool case_fold = false, + const std::string& normalization_form = "", + bool nmt = false + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -47,20 +47,21 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr spec, const std::string& normalization_form = "" ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_normalization_form(normalization_form) { - std::cerr << "CharsMapNormalization constructor3" << std::endl; constructor_validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_normalization_form); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); visitor.on_attribute("normalization_form", m_normalization_form); + visitor.on_attribute("case_fold", m_case_fold); + visitor.on_attribute("nmt", m_nmt); return true; } @@ -74,6 +75,8 @@ class CharsMapNormalization : public ov::op::Op { bool m_add_dummy_prefix = false; bool m_escape_whitespaces = false; + bool m_case_fold = false; + bool m_nmt = false; std::string m_normalization_form = ""; // spec should be preserved for the lifetime of the normalizer diff --git a/tests/layer_tests.py b/tests/layer_tests.py index f5d8fca9..59d93c7e 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -57,12 +57,16 @@ def hex_to_char(hex_str): @pytest.fixture(scope="session") -def unicode_normalization_test_data(request): +def icu_test_data(request): + return requests.get(UNICODE_TEST_FILE_URL).text + + +@pytest.fixture(scope="session") +def unicode_normalization_test_data(request, icu_test_data): # check https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt for details - test_file = requests.get(UNICODE_TEST_FILE_URL).text return [ parse_normalization_test_line(line) - for line in test_file.split("\n") + for line in icu_test_data.split("\n") if line and not line.startswith("#") and not line.startswith("@") ] @@ -167,26 +171,41 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled @pytest.mark.parametrize( "test_parameters", [ - ("NFC", 19875, 90), - ("NFD", 19851, 114), - ("NFKC", 19777, 188), - ("NFKD", 19753, 212), + # results for sentencepiece charsmap: + ("NFC", 17325), # failed examples: 2640 + ("NFD", 17736), # failed examples: 2229 + ("NFKC", 17159), # failed examples: 2806 + ("NFKD", 17554), # failed examples: 2411 + # results for icu70: + # ("NFC", 19875), # failed examples: 90 + # ("NFD", 19851), # failed examples: 114 + # ("NFKC", 19777), # failed examples: 188 + # ("NFKD", 19753), # failed examples: 212 + # results for huggingface tokenizers: + # ("NFC", 19247), # failed examples: 718 + # ("NFD", 19220), # failed examples: 745 + # ("NFKC", 19077), # failed examples: 888 + # ("NFKD", 19050), # failed examples: 915 ] ) def 
test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): - normalization_type, positive_threshold, negative_threshold = test_parameters - nfc_normalizer_layer = NormalizeUnicode(normalization_type) - compiled_model = create_normalization_model(nfc_normalizer_layer) - negative = 0 - positive = 0 + normalization_type, positive_threshold = test_parameters + normalizer_layer = NormalizeUnicode(normalization_type) + compiled_model = create_normalization_model(normalizer_layer) + positive, negative, no_transformation = 0, 0, 0 for test_input in unicode_normalization_test_data: res_ov = compiled_model([test_input.source])[0][0].encode() expected = getattr(test_input, normalization_type.lower()).encode() positive += res_ov == expected negative += res_ov != expected - - assert positive == positive_threshold - assert negative == negative_threshold + no_transformation += test_input.source.encode() == expected + + assert positive == positive_threshold, ( + f"{normalization_type}\n" + f"Positive: {positive}, expected: {positive_threshold}\n" + f"Negative: {negative}, expected: {len(unicode_normalization_test_data) - positive_threshold}\n" + f"No transformation: {no_transformation}, positive delta: {positive - no_transformation}" + ) @pytest.mark.parametrize( From ed1203f2183dfdcb13855694755b1ca62da4f6e4 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 8 Jan 2025 20:03:15 +0000 Subject: [PATCH 08/30] Switch Casefold and UnicodeNormalization to CharsMap --- .../openvino_tokenizers/tokenizer_pipeline.py | 66 +++++++++++++------ src/charsmap_normalization.cpp | 26 ++++---- src/charsmap_normalization.hpp | 7 +- tests/layer_tests.py | 19 ++++++ 4 files changed, 83 insertions(+), 35 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 3e751491..66742004 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -155,25 +155,15 @@ class NormalizationStep(BasePipelineStep): @dataclass -class _NormalizeUnicode(NormalizationStep): +class NormalizeUnicode(NormalizationStep): normalization_form: str = "NFD" - def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return ( - _get_factory() - .create( - "NormalizeUnicode", - input_nodes, - {"normalization_form": self.normalization_form}, + def __post_init__(self): + if self.normalization_form not in ["NFD", "NFC", "NFKD", "NFKC"]: + raise ValueError( + 'NormalizeUnicode`normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], ' + f'got {self.normalization_form}.' 
) - .outputs() - ) - pass - - -@dataclass -class NormalizeUnicode(NormalizationStep): - normalization_form: str = "NFD" def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return ( @@ -181,7 +171,10 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: .create( "CharsMapNormalization", input_nodes, - {"normalization_form": self.normalization_form.lower()}, + { + "normalization_form": self.normalization_form.lower(), + "remove_extra_whitespaces": False, + }, ) .outputs() ) @@ -199,7 +192,19 @@ def __post_init__(self): ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return _get_factory().create("CaseFold", input_nodes, {"encoding": self.encoding}).outputs() + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": "identity", + "case_fold": True, + "remove_extra_whitespaces": False, + }, + ) + .outputs() + ) @dataclass @@ -262,7 +267,17 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @dataclass class CharsmapStep(NormalizationStep): - charsmap: bytes + charsmap: Optional[bytes] = None + normalization_form: Optional[str] = None + add_dummy_prefix: bool = False + remove_extra_whitespaces: bool = True + escape_whitespaces: bool = False + case_fold: bool = False + nmt: bool = False + + def __post_init__(self): + if self.charsmap is None and self.normalization_form is None: + raise ValueError("[ CharsmapStep ] `charsmap` or `normalization_form` attribute must be set") @classmethod def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": @@ -270,7 +285,18 @@ def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes += make_constant_node(np.frombuffer(self.charsmap, dtype=np.uint8), dtype=Type.u8).outputs() - return _get_factory().create("CharsMapNormalization", input_nodes).outputs() + return _get_factory().create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": self.normalization_form or "", + "add_dummy_prefix": self.add_dummy_prefix, + "remove_extra_whitespaces": self.remove_extra_whitespaces, + "escape_whitespaces": self.escape_whitespaces, + "case_fold": self.case_fold, + "nmt": self.nmt, + } + ).outputs() @dataclass diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 1a6256e8..5178801a 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -38,34 +38,34 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor if (m_normalizer == nullptr) { m_spec = std::make_shared(); m_spec->set_add_dummy_prefix(m_add_dummy_prefix); + m_spec->set_remove_extra_whitespaces(m_remove_extra_whitespaces); m_spec->set_escape_whitespaces(m_escape_whitespaces); - std::string precompiled_charsmap; - if (m_normalization_form == "") { - precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + sentencepiece::normalizer::Builder::CharsMap chars_map; + if (m_normalization_form == "identity") { + // no need to modify chars_map } else if (m_normalization_form == "nfc") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfd") { - sentencepiece::normalizer::Builder::CharsMap chars_map; 
sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfkc") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else if (m_normalization_form == "nfkd") { - sentencepiece::normalizer::Builder::CharsMap chars_map; sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); - sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); } else { OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); }; - std::cerr << "CharsMapNormalization: precompiled_charsmap.size() = " << precompiled_charsmap.size() << std::endl; - std::cerr << "CharsMapNormalization: precompiled_charsmap first 100 chars = " << precompiled_charsmap.substr(0, 100) << std::endl; + if (m_case_fold) { + sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); + }; + std::string precompiled_charsmap; + if (m_normalization_form == "") { + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + } else { + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + } m_spec->set_precompiled_charsmap(precompiled_charsmap); m_normalizer = std::make_shared(*m_spec); diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp index 43ab09d3..a6179b63 100644 --- a/src/charsmap_normalization.hpp +++ b/src/charsmap_normalization.hpp @@ -34,11 +34,12 @@ class CharsMapNormalization : public ov::op::Op { const std::shared_ptr normalizer, const std::shared_ptr spec, bool add_dummy_prefix = false, + bool remove_extra_whitespaces = false, bool escape_whitespaces = false, bool case_fold = false, const std::string& normalization_form = "", bool nmt = false - ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ + ): ov::op::Op(arguments), m_normalizer(normalizer), m_spec(spec), m_add_dummy_prefix(add_dummy_prefix), m_remove_extra_whitespaces(remove_extra_whitespaces), m_escape_whitespaces(escape_whitespaces), m_case_fold(case_fold), m_normalization_form(normalization_form), m_nmt(nmt){ constructor_validate_and_infer_types(); } CharsMapNormalization( @@ -53,11 +54,12 @@ class CharsMapNormalization : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override { - return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); + return std::make_shared(inputs, m_normalizer, m_spec, m_add_dummy_prefix, m_remove_extra_whitespaces, m_escape_whitespaces, m_case_fold, m_normalization_form, m_nmt); } bool visit_attributes(ov::AttributeVisitor& visitor) override { visitor.on_attribute("add_dummy_prefix", m_add_dummy_prefix); + visitor.on_attribute("remove_extra_whitespaces", m_remove_extra_whitespaces); visitor.on_attribute("escape_whitespaces", m_escape_whitespaces); visitor.on_attribute("normalization_form", m_normalization_form); visitor.on_attribute("case_fold", m_case_fold); @@ -74,6 +76,7 @@ class CharsMapNormalization : public ov::op::Op { mutable std::shared_ptr 
m_normalizer; bool m_add_dummy_prefix = false; + bool m_remove_extra_whitespaces = true; bool m_escape_whitespaces = false; bool m_case_fold = false; bool m_nmt = false; diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 59d93c7e..56f87978 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -12,6 +12,7 @@ from openvino_tokenizers import _get_factory from openvino_tokenizers.constants import UTF8ReplaceMode from openvino_tokenizers.tokenizer_pipeline import ( + CaseFoldStep, CharsmapStep, DecodingStep, NormalizationStep, @@ -208,6 +209,24 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test ) + +@pytest.mark.parametrize( + "test_string, expected", + [ + ("a", "a"), + ("A", "a"), + ("Ю", "ю"), + ("Σ", "σ"), + ("Hello World!", "hello world!"), + ] +) +def test_casefold_normalization(test_string, expected): + casefold = CaseFoldStep() + compiled_model = create_normalization_model(casefold) + res_ov = compiled_model([test_string])[0] + assert res_ov == expected + + @pytest.mark.parametrize( "test_string, expected, layer", [ From 012fb8ee1facc3691fd133e5b29504692567d81c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 12:03:38 +0000 Subject: [PATCH 09/30] Update tests and fix custom charsmap support --- src/charsmap_normalization.cpp | 4 ++-- tests/layer_tests.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 5178801a..2efb6752 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -42,7 +42,7 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor m_spec->set_escape_whitespaces(m_escape_whitespaces); sentencepiece::normalizer::Builder::CharsMap chars_map; - if (m_normalization_form == "identity") { + if (m_normalization_form == "identity" || m_normalization_form == "") { // no need to modify chars_map } else if (m_normalization_form == "nfc") { sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); @@ -53,7 +53,7 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor } else if (m_normalization_form == "nfkd") { sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); } else { - OPENVINO_ASSERT(false, "Unsupported normalization form: " + m_normalization_form); + OPENVINO_ASSERT(false, "Unsupported normalization form: `" + m_normalization_form + "`"); }; if (m_case_fold) { diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 56f87978..a5079977 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -175,8 +175,8 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled # results for sentencepiece charsmap: ("NFC", 17325), # failed examples: 2640 ("NFD", 17736), # failed examples: 2229 - ("NFKC", 17159), # failed examples: 2806 - ("NFKD", 17554), # failed examples: 2411 + ("NFKC", 17224), # failed examples: 2741 + ("NFKD", 17619), # failed examples: 2346 # results for icu70: # ("NFC", 19875), # failed examples: 90 # ("NFD", 19851), # failed examples: 114 From 80927208afd32fb2dd47d5cd8dc2f9e69537a27d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 12:09:47 +0000 Subject: [PATCH 10/30] Ruff checks --- .../openvino_tokenizers/tokenizer_pipeline.py | 32 +++++++++++-------- tests/conftest.py | 2 +- tests/layer_tests.py | 14 ++++---- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py 
b/python/openvino_tokenizers/tokenizer_pipeline.py index 66742004..9394fbae 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -161,8 +161,8 @@ class NormalizeUnicode(NormalizationStep): def __post_init__(self): if self.normalization_form not in ["NFD", "NFC", "NFKD", "NFKC"]: raise ValueError( - 'NormalizeUnicode`normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], ' - f'got {self.normalization_form}.' + '[ NormalizeUnicode ] `normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], ' + f"got {self.normalization_form}." ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: @@ -285,18 +285,22 @@ def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep": def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes += make_constant_node(np.frombuffer(self.charsmap, dtype=np.uint8), dtype=Type.u8).outputs() - return _get_factory().create( - "CharsMapNormalization", - input_nodes, - { - "normalization_form": self.normalization_form or "", - "add_dummy_prefix": self.add_dummy_prefix, - "remove_extra_whitespaces": self.remove_extra_whitespaces, - "escape_whitespaces": self.escape_whitespaces, - "case_fold": self.case_fold, - "nmt": self.nmt, - } - ).outputs() + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": self.normalization_form or "", + "add_dummy_prefix": self.add_dummy_prefix, + "remove_extra_whitespaces": self.remove_extra_whitespaces, + "escape_whitespaces": self.escape_whitespaces, + "case_fold": self.case_fold, + "nmt": self.nmt, + }, + ) + .outputs() + ) @dataclass diff --git a/tests/conftest.py b/tests/conftest.py index f1d3fcfc..717ebb11 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,7 +57,7 @@ def add_tokenizer_type(row): results_df.hf_tiktoken_tokenizers_with_padding_sides_param, inplace=True ) results_df.status = (results_df.status == "passed").astype(int) - results_df = results_df.dropna(subset=['hf_wordpiece_tokenizers_param']) + results_df = results_df.dropna(subset=["hf_wordpiece_tokenizers_param"]) results_df["Model"] = ( results_df.hf_wordpiece_tokenizers_param + ["_legacy" * value for value in results_df.index.str.contains("Slow")] diff --git a/tests/layer_tests.py b/tests/layer_tests.py index a5079977..3cdf68c4 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -2,7 +2,7 @@ import re import tempfile from pathlib import Path -from typing import Union, NamedTuple +from typing import NamedTuple, Union import openvino as ov import pytest @@ -16,12 +16,12 @@ CharsmapStep, DecodingStep, NormalizationStep, + NormalizeUnicode, PreTokenizatinStep, RegexNormalizationStep, RegexSplitStep, TokenizerPipeline, UTF8ValidateStep, - NormalizeUnicode, ) from tests.utils import get_hf_tokenizer @@ -39,6 +39,7 @@ class NormalizationTestLine(NamedTuple): nfkd: str comment: str + def parse_normalization_test_line(line): parts, comment = line.split("#", 1) parts = [part.strip() for part in parts.split(";")] @@ -187,7 +188,7 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled # ("NFD", 19220), # failed examples: 745 # ("NFKC", 19077), # failed examples: 888 # ("NFKD", 19050), # failed examples: 915 - ] + ], ) def test_unicode_normalization_model(test_parameters, unicode_normalization_test_data): normalization_type, positive_threshold = test_parameters @@ -209,7 +210,6 @@ def 
test_unicode_normalization_model(test_parameters, unicode_normalization_test ) - @pytest.mark.parametrize( "test_string, expected", [ @@ -218,7 +218,7 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test ("Ю", "ю"), ("Σ", "σ"), ("Hello World!", "hello world!"), - ] + ], ) def test_casefold_normalization(test_string, expected): casefold = CaseFoldStep() @@ -263,9 +263,9 @@ def test_casefold_normalization(test_string, expected): RegexNormalizationStep( regex_search_pattern=r"(^)(.+)", replace_term=r"▁$2", - ) + ), ), - ] + ], ) def test_regex_normalization(test_string, expected, layer): compiled_model = create_normalization_model(layer) From 6a611f345af6a7bf4358393475f793f263ac98d6 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 14:02:34 +0000 Subject: [PATCH 11/30] wip --- .github/workflows/linux.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 5b3d19ac..b2980197 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -109,6 +109,7 @@ jobs: - name: CMake configure - tokenizers run: | + apt install -y libicu-dev source ${INSTALL_DIR}/setupvars.sh cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ From 258f0f49f7dde8eedaa1f31aac03851039a7b3df Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 14:17:03 +0000 Subject: [PATCH 12/30] wip --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index b2980197..6573af48 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -109,7 +109,7 @@ jobs: - name: CMake configure - tokenizers run: | - apt install -y libicu-dev + apt-get update && apt install -y libicu-dev source ${INSTALL_DIR}/setupvars.sh cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ From baf0e705831b62fa9f5f2547ee78d7b7bad62df8 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 Jan 2025 15:40:59 +0000 Subject: [PATCH 13/30] wip --- .github/workflows/linux.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 6573af48..1295ce0a 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -187,6 +187,7 @@ jobs: - name: Build tokenizers wheel run: | + apt-get update && apt install -y libicu-dev python -m pip wheel -v --no-deps --wheel-dir ${BUILD_DIR} \ --config-settings=override=cross.arch="manylinux_2_31_x86_64" \ --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ From 6177b81c805a24bbe6b9194b42a9efe902dbaf9f Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 10:45:34 +0000 Subject: [PATCH 14/30] Switch Off FastTokenizer Support UnicodeNormalization and CaseFold operations using new backend. Do not use FastTokenizer dependency during build. 
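
A minimal sketch (illustration only, not applied by this patch) of how the reworked
steps are expected to be exercised, assuming the create_normalization_model() helper
defined in tests/layer_tests.py; with this change both steps lower to the
sentencepiece-backed CharsMapNormalization op instead of the FastTokenizer kernels:

    # Sketch under the assumptions above; expected output follows the new layer tests.
    from openvino_tokenizers.tokenizer_pipeline import CaseFoldStep, NormalizeUnicode
    from tests.layer_tests import create_normalization_model  # helper from the test module

    # encoding="utf-8" -> CharsMapNormalization with an identity chars map plus Unicode case folding
    casefold = create_normalization_model(CaseFoldStep("utf-8"))
    print(casefold(["Hello World!"])[0][0])  # expected: "hello world!"

    # NFC/NFD/NFKC/NFKD are compiled into a precompiled chars map by sentencepiece's Builder
    nfkc = create_normalization_model(NormalizeUnicode("NFKC"))
    print(nfkc(["①"])[0][0])  # normalization is handled by the CharsMapNormalization node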
--- .../openvino_tokenizers/tokenizer_pipeline.py | 27 ++++---- src/CMakeLists.txt | 2 +- src/case_fold.cpp | 30 ++++++--- src/case_fold.hpp | 8 ++- src/charsmap_normalization.cpp | 57 +++++++++------- src/normalize_unicode.cpp | 65 ++++++++++++++----- src/normalize_unicode.hpp | 10 +-- src/ov_extension.cpp | 23 ++----- src/tensorflow_translators.cpp | 11 ---- src/tensorflow_translators.hpp | 3 - src/tokenizer.hpp | 3 - tests/layer_tests.py | 21 +++--- 12 files changed, 143 insertions(+), 117 deletions(-) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 72ef0d16..c145292e 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -192,19 +192,22 @@ def __post_init__(self): ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: - return ( - _get_factory() - .create( - "CharsMapNormalization", - input_nodes, - { - "normalization_form": "identity", - "case_fold": True, - "remove_extra_whitespaces": False, - }, + if self.encoding == "": + return _get_factory().create("CaseFold", input_nodes, {"encoding": self.encoding}).outputs() + else: + return ( + _get_factory() + .create( + "CharsMapNormalization", + input_nodes, + { + "normalization_form": "identity", + "case_fold": True, + "remove_extra_whitespaces": False, + }, + ) + .outputs() ) - .outputs() - ) @dataclass diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 97aae478..942521f9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,7 +82,7 @@ else() endif() endif() -cmake_dependent_option(ENABLE_FAST_TOKENIZERS "Enables Fast Tokenizers usage in OpenVINO Tokenizers" ON "FAST_TOKENIZERS_SUPPORTED" OFF) +cmake_dependent_option(ENABLE_FAST_TOKENIZERS "Enables Fast Tokenizers usage in OpenVINO Tokenizers" OFF "FAST_TOKENIZERS_SUPPORTED" OFF) if(ENABLE_FAST_TOKENIZERS) # The option is forced to ON if _GLIBCXX_USE_CXX11_ABI=0 or on Android (where prebuilt version is not available) diff --git a/src/case_fold.cpp b/src/case_fold.cpp index 043248dc..1c0821be 100644 --- a/src/case_fold.cpp +++ b/src/case_fold.cpp @@ -2,12 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#ifdef ENABLE_FAST_TOKENIZERS - #include "case_fold.hpp" #include "utils.hpp" - -#include "fast_tokenizer/normalizers/normalizers.h" +#include "builder.h" // for making normalizer spec using namespace ov; @@ -31,6 +28,24 @@ void CaseFold::validate_and_infer_types() { bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { const bool has_skips = (inputs.size() == 4); + { + std::lock_guard lock(m_mutex); + + if (m_normalizer == nullptr && m_encoding == "utf-8") { + m_spec = std::make_shared(); + m_spec->set_add_dummy_prefix(false); + m_spec->set_remove_extra_whitespaces(true); + m_spec->set_escape_whitespaces(false); + + sentencepiece::normalizer::Builder::CharsMap chars_map; + sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); + std::string precompiled_charsmap; + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + m_spec->set_precompiled_charsmap(precompiled_charsmap); + + m_normalizer = std::make_shared(*m_spec); + } + } if (m_encoding.empty()) { return evaluate_normalization_helper( outputs, inputs, @@ -45,12 +60,9 @@ bool CaseFold::evaluate(ov::TensorVector& outputs, const ov::TensorVector& input return evaluate_normalization_helper( outputs, inputs, - [](const std::string& str) { - using namespace paddlenlp::fast_tokenizer; 
- return normalizers::NormalizedString(str).Lowercase().GetStr(); + [&](const std::string& str) { + return m_normalizer->Normalize(str); }, has_skips); } } - -#endif // ENABLE_FAST_TOKENIZERS diff --git a/src/case_fold.hpp b/src/case_fold.hpp index 9ae3a75c..bfdf99d5 100644 --- a/src/case_fold.hpp +++ b/src/case_fold.hpp @@ -4,8 +4,7 @@ #pragma once -#ifdef ENABLE_FAST_TOKENIZERS - +#include "normalizer.h" // from sentencepiece #include class CaseFold : public ov::op::Op { @@ -40,6 +39,9 @@ class CaseFold : public ov::op::Op { private: std::string m_encoding = "utf-8"; + mutable std::shared_ptr m_normalizer; + // spec should be preserved for the lifetime of the normalizer + mutable std::shared_ptr m_spec; + mutable std::mutex m_mutex; }; -#endif // ENABLE_FAST_TOKENIZERS diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp index 2efb6752..0d0dae19 100644 --- a/src/charsmap_normalization.cpp +++ b/src/charsmap_normalization.cpp @@ -30,6 +30,31 @@ void CharsMapNormalization::validate_and_infer_types() { }; } + +inline void init_sentencepiece_normalizer_chars_map( + const std::string& normalization_form, + const bool case_fold, + sentencepiece::normalizer::Builder::CharsMap& chars_map +) { + if (normalization_form == "identity") { + // no need to modify chars_map + } else if (normalization_form == "nfc") { + sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); + } else if (normalization_form == "nfd") { + sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); + } else if (normalization_form == "nfkc") { + sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); + } else if (normalization_form == "nfkd") { + sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); + } else { + OPENVINO_ASSERT(false, "Unsupported normalization form: `" + normalization_form + "`"); + }; + if (case_fold) { + sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); + }; +} + + bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { const bool has_skips = (inputs.size() == 5) || (m_normalization_form != "" && inputs.size() == 4); { @@ -41,31 +66,14 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor m_spec->set_remove_extra_whitespaces(m_remove_extra_whitespaces); m_spec->set_escape_whitespaces(m_escape_whitespaces); - sentencepiece::normalizer::Builder::CharsMap chars_map; - if (m_normalization_form == "identity" || m_normalization_form == "") { - // no need to modify chars_map - } else if (m_normalization_form == "nfc") { - sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); - } else if (m_normalization_form == "nfd") { - sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); - } else if (m_normalization_form == "nfkc") { - sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); - } else if (m_normalization_form == "nfkd") { - sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); - } else { - OPENVINO_ASSERT(false, "Unsupported normalization form: `" + m_normalization_form + "`"); - }; - - if (m_case_fold) { - sentencepiece::normalizer::Builder::MergeUnicodeCaseFoldMap(&chars_map); - }; - std::string precompiled_charsmap; - if (m_normalization_form == "") { - precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); - } else { + if (m_normalization_form != "") { + sentencepiece::normalizer::Builder::CharsMap chars_map; + init_sentencepiece_normalizer_chars_map(m_normalization_form, m_case_fold, 
chars_map); sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); - } + } else { + precompiled_charsmap = std::string(inputs[3 + has_skips].data(), inputs[3 + has_skips].get_size()); + }; m_spec->set_precompiled_charsmap(precompiled_charsmap); m_normalizer = std::make_shared(*m_spec); @@ -76,8 +84,7 @@ bool CharsMapNormalization::evaluate(ov::TensorVector& outputs, const ov::Tensor outputs, inputs, [&](const std::string& str) { - auto norm = m_normalizer->Normalize(str); - return norm; + return m_normalizer->Normalize(str); }, has_skips ); diff --git a/src/normalize_unicode.cpp b/src/normalize_unicode.cpp index a8c07f50..39c6c999 100644 --- a/src/normalize_unicode.cpp +++ b/src/normalize_unicode.cpp @@ -2,36 +2,41 @@ // SPDX-License-Identifier: Apache-2.0 // -#ifdef ENABLE_FAST_TOKENIZERS - #ifdef _MSC_VER # pragma warning(disable : 4251) # pragma warning(disable : 4275) #endif -#include "fast_tokenizer/normalizers/normalizers.h" - #include "normalize_unicode.hpp" #include "utils.hpp" +#include "builder.h" // for making normalizer spec using namespace ov; -namespace { -using namespace paddlenlp::fast_tokenizer::normalizers; -using NormalizersMap = std::map>; - -const NormalizersMap normalizers = { - {"NFD", [](const std::string& str) { return NormalizedString(str).NFD().GetStr(); }}, - {"NFC", [](const std::string& str) { return NormalizedString(str).NFC().GetStr(); }}, - {"NFKD", [](const std::string& str) { return NormalizedString(str).NFKD().GetStr(); }}, - {"NFKC", [](const std::string& str) { return NormalizedString(str).NFKC().GetStr(); }}, -}; +inline void init_unicode_normalizer_chars_map( + const std::string& normalization_form, + sentencepiece::normalizer::Builder::CharsMap& chars_map +) { + if (normalization_form == "NFC") { + sentencepiece::normalizer::Builder::BuildNFCMap(&chars_map); + } else if (normalization_form == "NFD") { + sentencepiece::normalizer::Builder::BuildNFDMap(&chars_map); + } else if (normalization_form == "NFKC") { + sentencepiece::normalizer::Builder::BuildNFKCMap(&chars_map); + } else if (normalization_form == "NFKD") { + sentencepiece::normalizer::Builder::BuildNFKDMap(&chars_map); + } else { + OPENVINO_ASSERT(false, "Unsupported normalization form: `" + normalization_form + "`"); + }; } + void NormalizeUnicode::validate_and_infer_types() { check_string_input(this, 0); - OPENVINO_ASSERT(normalizers.find(m_normalization_form) != normalizers.end(), "NormalizeUnicode doesn't know normalization form ", m_normalization_form); + OPENVINO_ASSERT( + m_normalization_form == "NFC" || m_normalization_form == "NFD" || m_normalization_form == "NFKC" || m_normalization_form == "NFKD", + "NormalizeUnicode doesn't know normalization form ", m_normalization_form); set_string_output(this, 0, get_input_partial_shape(0)); auto input_size = get_input_size(); @@ -44,7 +49,31 @@ void NormalizeUnicode::validate_and_infer_types() { bool NormalizeUnicode::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { const bool has_skips = (inputs.size() == 4); - return evaluate_normalization_helper(outputs, inputs, normalizers.at(m_normalization_form), has_skips); -} -#endif // ENABLE_FAST_TOKENIZERS + { + std::lock_guard lock(m_mutex); + + if (m_normalizer == nullptr) { + m_spec = std::make_shared(); + m_spec->set_add_dummy_prefix(false); + m_spec->set_remove_extra_whitespaces(true); + m_spec->set_escape_whitespaces(false); + + sentencepiece::normalizer::Builder::CharsMap chars_map; + 
init_unicode_normalizer_chars_map(m_normalization_form, chars_map); + std::string precompiled_charsmap; + sentencepiece::normalizer::Builder::CompileCharsMap(chars_map, &precompiled_charsmap); + m_spec->set_precompiled_charsmap(precompiled_charsmap); + + m_normalizer = std::make_shared(*m_spec); + } + } + return evaluate_normalization_helper( + outputs, + inputs, + [&](const std::string& str) { + return m_normalizer->Normalize(str); + }, + has_skips + ); +} diff --git a/src/normalize_unicode.hpp b/src/normalize_unicode.hpp index 12c8043e..04c832d6 100644 --- a/src/normalize_unicode.hpp +++ b/src/normalize_unicode.hpp @@ -4,8 +4,7 @@ #pragma once -#ifdef ENABLE_FAST_TOKENIZERS - +#include "normalizer.h" // from sentencepiece #include class NormalizeUnicode : public ov::op::Op { @@ -38,8 +37,9 @@ class NormalizeUnicode : public ov::op::Op { } private: - std::string m_normalization_form = "NFD"; + mutable std::shared_ptr m_normalizer; + // spec should be preserved for the lifetime of the normalizer + mutable std::shared_ptr m_spec; + mutable std::mutex m_mutex; }; - -#endif // ENABLE_FAST_TOKENIZERS diff --git a/src/ov_extension.cpp b/src/ov_extension.cpp index 7369fe42..5de761b7 100644 --- a/src/ov_extension.cpp +++ b/src/ov_extension.cpp @@ -20,25 +20,10 @@ std::make_shared("Equal", translate_equal), \ std::make_shared("StringToHashBucketFast", translate_string_to_hash_bucket_fast), \ std::make_shared("Squeeze", translate_squeeze_op), \ - std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), - -#ifdef ENABLE_FAST_TOKENIZERS - -#define OPENVINO_TOKENIZERS_FAST_TOKENIZER_BASED_EXTENSIONS \ - std::make_shared>(), \ - std::make_shared>(), - -#define OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS_FAST_TOKENIZER_BASED \ + std::make_shared("WordpieceTokenizeWithOffsets", translate_wordpiece_tokenize_with_offsets), \ std::make_shared("StringLower", translate_string_lower), \ std::make_shared("NormalizeUTF8", translate_normalize_utf8), \ - std::make_shared("CaseFoldUTF8", translate_case_fold_utf8), - -#else - -#define OPENVINO_TOKENIZERS_FAST_TOKENIZER_BASED_EXTENSIONS -#define OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS_FAST_TOKENIZER_BASED - -#endif // ENABLE_FAST_TOKENIZERS + std::make_shared("CaseFoldUTF8", translate_case_fold_utf8) // clang-format off //! [ov_extension:entry_point] @@ -70,9 +55,9 @@ OPENVINO_CREATE_EXTENSIONS( std::make_shared>(), std::make_shared>(), std::make_shared>(), - OPENVINO_TOKENIZERS_FAST_TOKENIZER_BASED_EXTENSIONS + std::make_shared>(), + std::make_shared>(), OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS - OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS_FAST_TOKENIZER_BASED })); //! 
[ov_extension:entry_point] // clang-format on diff --git a/src/tensorflow_translators.cpp b/src/tensorflow_translators.cpp index e279d752..03c80192 100644 --- a/src/tensorflow_translators.cpp +++ b/src/tensorflow_translators.cpp @@ -23,11 +23,8 @@ #include "string_to_hash_bucket.hpp" #include "vocab_encoder.hpp" #include "wordpiece_tokenizer.hpp" - -#ifdef ENABLE_FAST_TOKENIZERS #include "case_fold.hpp" #include "normalize_unicode.hpp" -#endif // ENABLE_FAST_TOKENIZERS using namespace ov; using namespace ov::op; @@ -156,8 +153,6 @@ NamedOutputVector translate_ragged_tensor_to_sparse(const NodeContext& node) { return named_results; } -#ifdef ENABLE_FAST_TOKENIZERS - ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node) { FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "CaseFold expects only 1 input"); return { post_translate_string_tensor_output(std::make_shared( @@ -171,8 +166,6 @@ ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node) node.get_attribute("normalization_form"))->outputs()) }; } -#endif // ENABLE_FAST_TOKENIZERS - ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node) { auto node_name = node.get_name(); FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StaticRegexReplace expects only 1 input"); @@ -221,8 +214,6 @@ ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::N return { post_translate_ragged_tensor_output(wp_tokenizer->outputs()) }; } -#ifdef ENABLE_FAST_TOKENIZERS - ov::OutputVector translate_string_lower(const ov::frontend::NodeContext& node) { auto node_name = node.get_name(); FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StringLower expects only 1 input"); @@ -233,8 +224,6 @@ ov::OutputVector translate_string_lower(const ov::frontend::NodeContext& node) { return { string_lower_result }; } -#endif // ENABLE_FAST_TOKENIZERS - OutputVector translate_lookup_table_find_op(const ov::frontend::NodeContext& node) { FRONT_END_GENERAL_CHECK(node.get_input_size() == 3, "LookupTableFind or LookupTableFindV2 expects 3 inputs"); auto table_handle = as_type_ptr(node.get_input_by_reference(0).get_node_shared_ptr()); diff --git a/src/tensorflow_translators.hpp b/src/tensorflow_translators.hpp index 4dbc26b2..250d3c60 100644 --- a/src/tensorflow_translators.hpp +++ b/src/tensorflow_translators.hpp @@ -17,9 +17,6 @@ ov::OutputVector translate_equal(const ov::frontend::NodeContext& node); ov::OutputVector translate_string_to_hash_bucket_fast(const ov::frontend::NodeContext& node); ov::OutputVector translate_squeeze_op(const ov::frontend::NodeContext& node); ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::NodeContext& node); - -#ifdef ENABLE_FAST_TOKENIZERS ov::OutputVector translate_string_lower(const ov::frontend::NodeContext& node); ov::OutputVector translate_case_fold_utf8(const ov::frontend::NodeContext& node); ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node); -#endif // ENABLE_FAST_TOKENIZERS diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index 343fe032..e5c22314 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -28,10 +28,7 @@ #include "special_tokens_split.hpp" #include "charsmap_normalization.hpp" #include "wordpiece_tokenizer.hpp" - -#ifdef ENABLE_FAST_TOKENIZERS #include "case_fold.hpp" #include "normalize_unicode.hpp" -#endif // ENABLE_FAST_TOKENIZERS #include "tensorflow_translators.hpp" diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 9dd0f9c8..c9d53092 100644 
--- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -211,17 +211,22 @@ def test_unicode_normalization_model(test_parameters, unicode_normalization_test @pytest.mark.parametrize( - "test_string, expected", + "test_string, expected, is_uft8", [ - ("a", "a"), - ("A", "a"), - ("Ю", "ю"), - ("Σ", "σ"), - ("Hello World!", "hello world!"), + ("a", "a", True), + ("a", "a", False), + ("A", "a", True), + ("A", "a", False), + ("Ю", "ю", True), + ("Ю", "Ю", False), + ("Σ", "σ", True), + ("Σ", "Σ", False), + ("Hello World!", "hello world!", True), + ("Hello World!", "hello world!", False), ], ) -def test_casefold_normalization(test_string, expected): - casefold = CaseFoldStep() +def test_casefold_normalization(test_string, expected, is_uft8): + casefold = CaseFoldStep("utf-8" if is_uft8 else "") compiled_model = create_normalization_model(casefold) res_ov = compiled_model([test_string])[0] assert res_ov == expected From 68b7e4e6f7885bf37e9bc73bcbcd155ccf3e0c0c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 10:59:01 +0000 Subject: [PATCH 15/30] Delete torch from dependencies --- pyproject.toml | 6 +----- tests/tokenizers_test.py | 2 -- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1cf5ba0d..22ca6ba2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,17 +41,13 @@ transformers = [ "transformers[sentencepiece] >= 4.36.0", "tiktoken" ] -# chatglm2 custom tokenizer file imports torch, have to add torch dependency for tests -torch = [ - 'torch' -] dev = [ "ruff", "bandit", "pytest", "pytest_harvest", "pandas", - "openvino_tokenizers[transformers, torch]" + "openvino_tokenizers[transformers]" ] benchmark = [ "pandas", diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py index 1c021ce7..2cb8356f 100644 --- a/tests/tokenizers_test.py +++ b/tests/tokenizers_test.py @@ -124,8 +124,6 @@ "xlm-roberta-base", "microsoft/deberta-v3-base", "xlnet-base-cased", - # "THUDM/chatglm-6b", # hf_tokenizer init error - # "THUDM/chatglm2-6b", # _pad doesn't support padding side - broke in 4.45 # "THUDM/chatglm3-6b", # _pad doesn't support padding side - broke in 4.45 "t5-base", "facebook/musicgen-small", From 7244191308afe7fa570437ab74bb698a76696373 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 11:15:04 +0000 Subject: [PATCH 16/30] Delete FastTokenizer from cmake and readme --- README.md | 71 ----------------- src/CMakeLists.txt | 193 +-------------------------------------------- 2 files changed, 1 insertion(+), 263 deletions(-) diff --git a/README.md b/README.md index 7678bbb2..dc6c3c52 100644 --- a/README.md +++ b/README.md @@ -150,77 +150,6 @@ make After that, you can transfer all binaries from `build/src` to `` as described in the C++ installation instruction above. -### Reducing the ICU Data Size - -By default, all available ICU locales are supported, which significantly increases the package size. To reduce the size of the ICU libraries included in your final package, follow these steps: - -1. **Use the ICU Data Configuration File**: - - This file specifies which features and locales to include in a custom data bundle. You can find more information [here](https://unicode-org.github.io/icu/userguide/icu_data/buildtool.html#icu-data-configuration-file). - -2. 
**Set the ICU Data Filter File as an Environment Variable**: - - **On Unix-like systems (Linux, macOS)**: - Set the `ICU_DATA_FILTER_FILE` environment variable to the path of your configuration file (`filters.json`): - - ```bash - export ICU_DATA_FILTER_FILE="filters.json" - ``` - - - **On Windows**: - Set the `ICU_DATA_FILTER_FILE` environment variable using the Command Prompt or PowerShell: - - **Command Prompt:** - ```cmd - set ICU_DATA_FILTER_FILE=filters.json - ``` - - **PowerShell:** - ```powershell - $env:ICU_DATA_FILTER_FILE="filters.json" - ``` - -3. **Create a Configuration File**: - - An example configuration file (`filters.json`) might look like this: - - ```json - { - "localeFilter": { - "filterType": "language", - "includelist": [ - "en" - ] - } - } - ``` - -4. **Configure OpenVINO Tokenizers**: - - When building OpenVINO tokenizers, set the following CMake option during the project configuration: - - ```bash - -DBUILD_FAST_TOKENIZERS=ON - ``` - - Example for a pip installation path: - ```bash - ICU_DATA_FILTER_FILE= pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS=ON - ``` - -By following these instructions, you can effectively reduce the size of the ICU libraries in your final package. - -### Build OpenVINO Tokenizers without FastTokenizer Library - -If a tokenizer doesn't use `CaseFold`, `UnicodeNormalization` or `Wordpiece` operations, you can drastically reduce package binary size by building OpenVINO Tokenizers without FastTokenizer dependency with this flag: - -```bash --DENABLE_FAST_TOKENIZERS=OFF -``` - -This option can also help with building for platform that is supported by FastTokenizer, for example `Android x86_64`. - -Example for a pip installation path: -```bash - -pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.ENABLE_FAST_TOKENIZERS=OFF -``` - ## Usage :warning: OpenVINO Tokenizers can be inferred on a `CPU` device only. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 942521f9..c9277003 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,37 +60,6 @@ if("_GLIBCXX_USE_CXX11_ABI=0" IN_LIST OPENVINO_RUNTIME_COMPILE_DEFINITIONS) set(USE_ABI0 ON CACHE BOOL "Set -D_GLIBCXX_USE_CXX11_ABI to 0 for fast_tokenizers") endif() -if(ANDROID) - if(AARCH64 OR ARM) - set(FAST_TOKENIZERS_SUPPORTED ON) - if(ANDROID_NATIVE_API_LEVEL LESS 33) - message(FATAL_ERROR "FastTokenizers require ANDROID_NATIVE_API_LEVEL to be higher than 33. Please, either disable FastTokenizers or set ANDROID_NATIVE_API_LEVEL / ANDROID_PLATFORM") - endif() - elseif(X86_64 OR RISCV64) - message(WARNING "FastTokenizers are not available on ${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}. ENABLE_FAST_TOKENIZERS is set to OFF") - set(FAST_TOKENIZERS_SUPPORTED OFF) - else() - message(WARNING "Unsupport Android ${CMAKE_SYSTEM_PROCESSOR}. 
Please, contact OpenVINO Tokenizers developers") - endif() - set(FAST_TOKENIZER_FROM_SOURCES ON) -else() - set(FAST_TOKENIZERS_SUPPORTED ON) - if(USE_ABI0 OR (WIN32 AND CMAKE_BUILD_TYPE STREQUAL "Debug")) - set(FAST_TOKENIZER_FROM_SOURCES ON) - else() - set(FAST_TOKENIZER_FROM_SOURCES OFF) - endif() -endif() - -cmake_dependent_option(ENABLE_FAST_TOKENIZERS "Enables Fast Tokenizers usage in OpenVINO Tokenizers" OFF "FAST_TOKENIZERS_SUPPORTED" OFF) - -if(ENABLE_FAST_TOKENIZERS) - # The option is forced to ON if _GLIBCXX_USE_CXX11_ABI=0 or on Android (where prebuilt version is not available) - cmake_dependent_option(BUILD_FAST_TOKENIZERS "Compile core_tokenizers instead of downloading prebuilt library" OFF "NOT FAST_TOKENIZER_FROM_SOURCES" ON) -else() - set(BUILD_FAST_TOKENIZERS OFF CACHE BOOL "Compile core_tokenizers instead of downloading prebuilt library" FORCE) -endif() - # # Compile flags # @@ -122,18 +91,6 @@ if(WIN32 OR APPLE) set(CMAKE_DEBUG_POSTFIX "d") endif() -if(BUILD_FAST_TOKENIZERS) - set(THIRD_PARTY_BUILD_TYPE ${CMAKE_BUILD_TYPE}) - # Set FastTokenizers to use dynamic MSVC runtime - set(MSVC_STATIC_CRT OFF) # PADDLE_LIB - set(PCRE2_STATIC_RUNTIME OFF) # PCRE2_LIB - set(SPM_ENABLE_MSVC_MT_BUILD OFF) # sentencepiece libs - if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - ov_tokenizers_set_flags("CMAKE_CXX_FLAGS_RELEASE;CMAKE_CXX_FLAGS_MINSIZEREL;CMAKE_CXX_FLAGS_RELWITHDEBINFO" "/MD" "/MT") - ov_tokenizers_set_flags("CMAKE_CXX_FLAGS_DEBUG" "/MDd" "/MT") - endif() -endif() - # # Dependencies # @@ -235,114 +192,7 @@ function(ov_tokenizers_build_static_re2) target_compile_definitions(re2 PUBLIC $) endfunction() -if(BUILD_FAST_TOKENIZERS) - set(EXTERNAL_PROJECT_SOURCE_DIR ${CMAKE_BINARY_DIR}/_deps/fast_tokenizer/src) - set(EXTERNAL_PROJECT_BINARY_DIR ${CMAKE_BINARY_DIR}/_deps/fast_tokenizer/build) - set(EXTERNAL_PROJECT_SUBBUILD_DIR ${CMAKE_BINARY_DIR}/_deps/fast_tokenizer/sub-build) - - FetchContent_Declare( - fast_tokenizer - URL https://github.com/PaddlePaddle/PaddleNLP/archive/refs/tags/v2.6.1.tar.gz - URL_HASH SHA256=10e3489bc91e938c449a0448fa719e4536803ed6b1c1c95b3402430d6a8a221a - PATCH_COMMAND git --git-dir=${EXTERNAL_PROJECT_SOURCE_DIR} apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/patches/fast_tokenizers.patch" && - git --git-dir=${EXTERNAL_PROJECT_SOURCE_DIR} apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/patches/glog.patch" && - git --git-dir=${EXTERNAL_PROJECT_SOURCE_DIR} apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/patches/gflags.patch" && - git --git-dir=${EXTERNAL_PROJECT_SOURCE_DIR} apply --ignore-whitespace "${CMAKE_CURRENT_LIST_DIR}/patches/icu.patch" - SOURCE_DIR ${EXTERNAL_PROJECT_SOURCE_DIR} - BINARY_DIR ${EXTERNAL_PROJECT_BINARY_DIR} - SUBBUILD_DIR ${EXTERNAL_PROJECT_SUBBUILD_DIR} - ) - - FetchContent_GetProperties(fast_tokenizer) - if(NOT fast_tokenizer_POPULATED) - FetchContent_Populate( - fast_tokenizer - ) - set(EXTERNAL_OPTIONAL_ARGS - -DCMAKE_POLICY_DEFAULT_CMP0057=NEW - -DCMAKE_POLICY_DEFAULT_CMP0135=NEW) - set(WITH_PYTHON OFF) - set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - add_subdirectory(${fast_tokenizer_SOURCE_DIR}/fast_tokenizer - ${CMAKE_CURRENT_BINARY_DIR}/fast_tokenizer - EXCLUDE_FROM_ALL - ) - endif() - - # variables used later - set(FAST_TOKENIZER_INCS - "${fast_tokenizer_SOURCE_DIR}/fast_tokenizer" - "${CMAKE_BINARY_DIR}/third_party/dart/src/extern_dart/include/" - "${CMAKE_BINARY_DIR}/third_party/json/src/extern_json/single_include/" - "${CMAKE_BINARY_DIR}/third_party/install/re2/include/") - set(FAST_TOKENIZER_LIBS core_tokenizers) 
-elseif(ENABLE_FAST_TOKENIZERS) - if(WIN32 AND X86_64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.2.zip - URL_HASH SHA256=56470954014bdd3c8c8ad702d20f5f6aa5ab913bff92fd9c3c49ec6da31ff11d - ) - ov_tokenizers_build_static_re2() - elseif(LINUX AND X86_64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.2.tgz - URL_HASH SHA256=843a8299b55ef2e06ea50ba0d4ab4cb05b9e4cdb7cb8e29f3d55c494a1b7aecc - ) - elseif(LINUX AND AARCH64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-aarch64-1.0.2.tgz - URL_HASH SHA256=fc16c51b24a954ae3d659e1b233ce15349eafc1e4c72710b51a4f12fb2c03033 - ) - elseif(APPLE AND X86_64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.2.tgz - # TODO: restore once https://github.com/PaddlePaddle/PaddleNLP/issues/7505 is fixed - # URL_HASH SHA256=4c8123ad941b3e4325ef72f328db545e34d5eec2de3e2545e1ab8ebeeb5146a9 - ) - elseif(APPLE AND AARCH64) - FetchContent_Declare( - fast_tokenizer - URL https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-arm64-1.0.2.tgz - URL_HASH SHA256=ffb0f16ec96b2f5dbdb681d00d74e932e273ec1c2108196d13f2fd28abc4d266 - ) - else() - message(FATAL_ERROR "Platform ${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR} does not have prebuilt Fast Tokenizer" - "Please, use -DBUILD_FAST_TOKENIZERS=ON cmake option to enable build from soures") - endif() - - FetchContent_MakeAvailable(fast_tokenizer) - - # to allow find_library to work with conda-forge env - set(_old_CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ${CMAKE_FIND_ROOT_PATH_MODE_LIBRARY}) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake") - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ${_old_CMAKE_FIND_ROOT_PATH_MODE_LIBRARY}) - # since FastTokenizers.cmake overrides C++ standard, let's override it once again to required one - ov_tokenizers_set_cxx_standard() - - if(WIN32 AND X86_64) - # we use re2 library in regex_normalization operation, so have to add to this list - # because prebuilt fast_tokenizers package does not provide this library - list(APPEND FAST_TOKENIZER_LIBS re2) - endif() -else() - # in case if we don't build fast tokenizers, we have to include re2 explicitly - ov_tokenizers_build_static_re2() -endif() - -function(ov_tokenizers_link_fast_tokenizer TARGET_NAME) - if(ENABLE_FAST_TOKENIZERS) - target_include_directories(${TARGET_NAME} SYSTEM PRIVATE ${FAST_TOKENIZER_INCS}) - target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS}) - target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_FAST_TOKENIZERS) - else() - message(FATAL_ERROR "ENABLE_FAST_TOKENIZERS is turned off. 
This function must not be called") - endif() -endfunction() +ov_tokenizers_build_static_re2() function(ov_tokenizers_link_pcre2 TARGET_NAME) FetchContent_Declare( @@ -369,9 +219,7 @@ function(ov_tokenizers_link_pcre2 TARGET_NAME) endfunction() function(ov_tokenizers_link_re2 TARGET_NAME) - if(NOT ENABLE_FAST_TOKENIZERS) target_link_libraries(${TARGET_NAME} PRIVATE re2) - endif() endfunction() # @@ -387,9 +235,6 @@ add_library(${TARGET_NAME} SHARED ${SRCS}) # ov_tokenizers_link_sentencepiece(${TARGET_NAME}) -if(ENABLE_FAST_TOKENIZERS) - ov_tokenizers_link_fast_tokenizer(${TARGET_NAME}) -endif() ov_tokenizers_link_pcre2(${TARGET_NAME}) ov_tokenizers_link_re2(${TARGET_NAME}) @@ -399,36 +244,6 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_OPTIONS "${extra_flags}" target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_OPENVINO_EXTENSION_API) target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime openvino::threading) -# -# Post build steps to copy core_tokenizers dependencies -# - -if(ENABLE_FAST_TOKENIZERS) - if(BUILD_FAST_TOKENIZERS) - set(fast_tokenezers_libs_dir "${CMAKE_BINARY_DIR}/third_party/icu/src/extern_icu/icu4c/bin64") - else() - set(fast_tokenezers_libs_dir "${fast_tokenizer_SOURCE_DIR}/third_party/lib") - if(WIN32 AND X86_64) - set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll") - elseif(LINUX) - set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/libcore_tokenizers.so") - elseif(APPLE) - set(extra_libs "${fast_tokenizer_SOURCE_DIR}/lib/libcore_tokenizers.dylib") - endif() - endif() - - if(WIN32 AND X86_64) - list(APPEND extra_libs "${fast_tokenezers_libs_dir}/icudt70.dll" - "${fast_tokenezers_libs_dir}/icuuc70$<$:${CMAKE_DEBUG_POSTFIX}>.dll") - endif() - - if(extra_libs) - # post build steps - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${extra_libs} $) - endif() -endif() - # # Set install RPATH # @@ -491,12 +306,6 @@ install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_LIBDIR} COMPONENT openvino_tokenizers RUNTIME DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_BINDIR} COMPONENT openvino_tokenizers) -if(BUILD_FAST_TOKENIZERS) - install(TARGETS core_tokenizers - LIBRARY DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_LIBDIR} COMPONENT openvino_tokenizers - RUNTIME DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_BINDIR} COMPONENT openvino_tokenizers) -endif() - if(extra_libs) if(WIN32) set(extra_libs_location ${OPENVINO_TOKENIZERS_INSTALL_BINDIR}) From 082064c6b05bdf22ddfa6664369015e06c9612f1 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 11:16:27 +0000 Subject: [PATCH 17/30] Delete FastTokenizer related patches --- src/icu_filter_en.json | 8 --- src/patches/fast_tokenizers.patch | 72 ----------------------- src/patches/gflags.patch | 15 ----- src/patches/glog.patch | 47 --------------- src/patches/icu.patch | 95 ------------------------------- 5 files changed, 237 deletions(-) delete mode 100644 src/icu_filter_en.json delete mode 100644 src/patches/fast_tokenizers.patch delete mode 100644 src/patches/gflags.patch delete mode 100644 src/patches/glog.patch delete mode 100644 src/patches/icu.patch diff --git a/src/icu_filter_en.json b/src/icu_filter_en.json deleted file mode 100644 index d7406489..00000000 --- a/src/icu_filter_en.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "localeFilter": { - "filterType": "language", - "includelist": [ - "en" - ] - } -} diff --git a/src/patches/fast_tokenizers.patch b/src/patches/fast_tokenizers.patch deleted file mode 100644 index 
6629b4af..00000000 --- a/src/patches/fast_tokenizers.patch +++ /dev/null @@ -1,72 +0,0 @@ -diff --git a/fast_tokenizer/cmake/ByproductsICU.cmake b/fast_tokenizer/cmake/ByproductsICU.cmake -index 3b68f082..6ae7e8f0 100644 ---- a/fast_tokenizer/cmake/ByproductsICU.cmake -+++ b/fast_tokenizer/cmake/ByproductsICU.cmake -@@ -15,14 +15,14 @@ - # See the License for the specific language governing permissions and - # limitations under the License. - --function(GetICUByproducts ICU_PATH ICU_LIB_VAR ICU_INCLUDE_VAR ICU_BASE_NAMES_VAR) -+function(GetICUByproducts ICU_PATH ICU_LIB_VAR ICU_INCLUDE_VAR ICU_BASE_NAMES_VAR ICU_LIB_POSTFIX) - # include directory - set(${ICU_INCLUDE_VAR} "${ICU_PATH}/include" PARENT_SCOPE) -- -+ - if (WIN32) - # windows basenames and pre/suffixes - set(ICU_LIB_BASE_NAMES dt in io tu uc) -- -+ - set(ICU_SHARED_PREFIX "lib") - set(ICU_STATIC_PREFIX "") - set(ICU_SHARED_SUFFIX ".dll.a") -@@ -39,9 +39,14 @@ function(GetICUByproducts ICU_PATH ICU_LIB_VAR ICU_INCLUDE_VAR ICU_BASE_NAMES_VA - endif() - # add static and shared libs to the libraries variable - foreach(ICU_BASE_NAME ${ICU_LIB_BASE_NAMES}) -- set(ICU_SHARED_LIB "${ICU_PATH}/${ICU_INSTALL_LIB}/${ICU_SHARED_PREFIX}icu${ICU_BASE_NAME}${ICU_SHARED_SUFFIX}") -- set(ICU_STATIC_LIB "${ICU_PATH}/${ICU_INSTALL_LIB}/${ICU_STATIC_PREFIX}icu${ICU_BASE_NAME}${ICU_STATIC_SUFFIX}") -- -+ if(ICU_BASE_NAME STREQUAL "dt") -+ set(ICU_NAME "${ICU_BASE_NAME}") -+ else() -+ set(ICU_NAME "${ICU_BASE_NAME}${ICU_LIB_POSTFIX}") -+ endif() -+ set(ICU_SHARED_LIB "${ICU_PATH}/${ICU_INSTALL_LIB}/${ICU_SHARED_PREFIX}icu${ICU_NAME}${ICU_SHARED_SUFFIX}") -+ set(ICU_STATIC_LIB "${ICU_PATH}/${ICU_INSTALL_LIB}/${ICU_STATIC_PREFIX}icu${ICU_NAME}${ICU_STATIC_SUFFIX}") -+ - if (ICU_STATIC) - list(APPEND ${ICU_LIB_VAR} ${ICU_STATIC_LIB}) - else() - -diff --git a/fast_tokenizer/CMakeLists.txt b/fast_tokenizer/CMakeLists.txt -index ce238239..39f34fa4 100644 ---- a/fast_tokenizer/CMakeLists.txt -+++ b/fast_tokenizer/CMakeLists.txt -@@ -51,7 +51,7 @@ else() - set(CMAKE_CXX_STANDARD 11) - endif() - --IF(WIN32) -+IF(WIN32 AND MSVC_STATIC_CRT) - # Need to add flags for windows - foreach( - flag_var -@@ -126,7 +126,7 @@ set(${flag_var} - set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") - endforeach() - --ELSE(WIN32) -+ELSE() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC") - IF (NOT APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ldl") -@@ -137,7 +137,7 @@ ELSE(WIN32) - ENDIF() - ENDIF() - set (PUBLIC_DEPEND_LIBS ${CMAKE_DL_LIBS}) --ENDIF(WIN32) -+ENDIF() - - set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) - set(TOKENIZERS_INSTALL_INCLUDE_DIR ${PROJECT_SOURCE_DIR}) diff --git a/src/patches/gflags.patch b/src/patches/gflags.patch deleted file mode 100644 index 0217c11c..00000000 --- a/src/patches/gflags.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/fast_tokenizer/cmake/external/gflags.cmake b/fast_tokenizer/cmake/external/gflags.cmake -index df5b3642..fcf385d8 100644 ---- a/fast_tokenizer/cmake/external/gflags.cmake -+++ b/fast_tokenizer/cmake/external/gflags.cmake -@@ -23,8 +23,8 @@ IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) - ELSE(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) -- set(BUILD_COMMAND $(MAKE) --silent) -- set(INSTALL_COMMAND $(MAKE) install) -+ set(BUILD_COMMAND ${CMAKE_COMMAND} --build .) -+ set(INSTALL_COMMAND ${CMAKE_COMMAND} --install .) 
- ENDIF(WIN32) - - INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) diff --git a/src/patches/glog.patch b/src/patches/glog.patch deleted file mode 100644 index 6e40a08e..00000000 --- a/src/patches/glog.patch +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/fast_tokenizer/cmake/external/glog.cmake b/fast_tokenizer/cmake/external/glog.cmake -index 2afc3960..fc2b21ce 100644 ---- a/fast_tokenizer/cmake/external/glog.cmake -+++ b/fast_tokenizer/cmake/external/glog.cmake -@@ -21,17 +21,29 @@ SET(GLOG_REPOSITORY ${GIT_URL}/google/glog.git) - SET(GLOG_TAG v0.4.0) - - IF(WIN32) -- SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." FORCE) -+ SET(GLOG_LIBRARIES_RELEASE "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog release library." FORCE) -+ SET(GLOG_LIBRARIES_DEBUG "${GLOG_INSTALL_DIR}/lib/glogd.lib" CACHE FILEPATH "glog debug library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - ELSE(WIN32) -- SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) -+ SET(GLOG_LIBRARIES_RELEASE "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog release library." FORCE) -+ SET(GLOG_LIBRARIES_DEBUG "${GLOG_INSTALL_DIR}/lib/libglogd.a" CACHE FILEPATH "glog release library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - ENDIF(WIN32) - -+if(GENERATOR_IS_MULTI_CONFIG_VAR) -+ set(GLOG_LIBRARIES "$:${GLOG_LIBRARIES_DEBUG} $:${GLOG_LIBRARIES_RELEASE}") -+else() -+ if(CMAKE_BUILD_TYPE STREQUAL "Debug") -+ set(GLOG_LIBRARIES "${GLOG_LIBRARIES_DEBUG}") -+ else() -+ set(GLOG_LIBRARIES "${GLOG_LIBRARIES_RELEASE}") -+ endif() -+endif() -+ - INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) - --IF(ANDROID) -+IF(ANDROID) - set(CROSS_COMPILE_CMAKE_ARGS - "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}" - "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" -@@ -112,6 +124,7 @@ ExternalProject_Add( - ENDIF() - - ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) --SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -+SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION "${GLOG_LIBRARIES_RELEASE}") -+SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION_DEBUG "${GLOG_LIBRARIES_DEBUG}") - ADD_DEPENDENCIES(glog extern_glog gflags) - LINK_LIBRARIES(glog) -\ No newline at end of file diff --git a/src/patches/icu.patch b/src/patches/icu.patch deleted file mode 100644 index 203d6688..00000000 --- a/src/patches/icu.patch +++ /dev/null @@ -1,95 +0,0 @@ -diff --git a/fast_tokenizer/cmake/external/icu.cmake b/fast_tokenizer/cmake/external/icu.cmake -index cd604d38..a949a156 100644 ---- a/fast_tokenizer/cmake/external/icu.cmake -+++ b/fast_tokenizer/cmake/external/icu.cmake -@@ -50,11 +50,33 @@ set(HOST_ENV_CMAKE ${CMAKE_COMMAND} -E env - LDFLAGS=${HOST_LDFLAGS} - ) - -+if(WIN32) -+ set(CMAKE_DEBUG_POSTFIX "d") -+else() -+ set(CMAKE_DEBUG_POSTFIX "") -+endif() -+ - # predict host libraries - set(ICU_STATIC TRUE) --GetICUByproducts(${ICU_INSTALL_DIR} ICU_LIBRARIES ICU_INCLUDE_DIRS ICU_BASE_NAMES) -+GetICUByproducts(${ICU_INSTALL_DIR} ICU_LIBRARIES_RELEASE ICU_INCLUDE_DIRS ICU_BASE_NAMES "") -+GetICUByproducts(${ICU_INSTALL_DIR} ICU_LIBRARIES_DEBUG ICU_INCLUDE_DIRS ICU_BASE_NAMES "${CMAKE_DEBUG_POSTFIX}") - INCLUDE_DIRECTORIES(${ICU_INCLUDE_DIRS}) - -+if(GENERATOR_IS_MULTI_CONFIG_VAR) -+ set(ICU_LIBRARIES "$:${ICU_LIBRARIES_DEBUG} $:${ICU_LIBRARIES_RELEASE}") -+ set(ICU_CONFIGURE_FLAGS $<$:"--enable-debug">$<$:"--enable-release">) -+ set(ICU_BUILD_TYPE $) -+else() -+ if(CMAKE_BUILD_TYPE STREQUAL 
"Debug") -+ set(ICU_LIBRARIES "${ICU_LIBRARIES_DEBUG}") -+ set(ICU_CONFIGURE_FLAGS "--enable-debug") -+ else() -+ set(ICU_LIBRARIES "${ICU_LIBRARIES_RELEASE}") -+ set(ICU_CONFIGURE_FLAGS "--enable-release") -+ endif() -+ set(ICU_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -+endif() -+ - if(WIN32) - ExternalProject_Add( - extern_icu -@@ -65,7 +87,7 @@ ExternalProject_Add( - GIT_PROGRESS 1 - PREFIX ${ICU_PREFIX_DIR} - UPDATE_COMMAND "" -- CONFIGURE_COMMAND msbuild ..\\extern_icu\\icu4c\\source\\allinone\\allinone.sln /p:Configuration=Release /p:Platform=x64 /p:RuntimeLibrary=MT_StaticRelease /p:SkipUWP=true -+ CONFIGURE_COMMAND msbuild ..\\extern_icu\\icu4c\\source\\allinone\\allinone.sln /p:Configuration=${ICU_BUILD_TYPE} /p:Platform=x64 /p:SkipUWP=true - BUILD_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ../extern_icu/icu4c/include ${ICU_INSTALL_DIR}/include - && ${CMAKE_COMMAND} -E copy_directory ../extern_icu/icu4c/lib64 ${ICU_INSTALL_DIR}/lib64 -@@ -81,7 +103,7 @@ ExternalProject_Add( - GIT_PROGRESS 1 - PREFIX ${ICU_PREFIX_DIR} - UPDATE_COMMAND "" -- CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "MacOSX/GCC" --enable-static --disable-shared --enable-rpath -+ CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "MacOSX/GCC" ${ICU_CONFIGURE_FLAGS} --enable-static --enable-rpath - BUILD_COMMAND make -j4 - INSTALL_COMMAND make install prefix="" DESTDIR=${ICU_INSTALL_DIR} install - BUILD_BYPRODUCTS ${ICU_LIBRARIES} -@@ -98,7 +120,7 @@ ExternalProject_Add( - BUILD_COMMAND "" - INSTALL_COMMAND - ${CMAKE_COMMAND} -E remove_directory ${ICU_INSTALL_DIR} && -- ${CMAKE_COMMAND} -E make_directory ${ICU_INSTALL_DIR} && -+ ${CMAKE_COMMAND} -E make_directory ${ICU_INSTALL_DIR} && - ${CMAKE_COMMAND} -E rename ${ICU_PREFIX_DIR}/src/extern_icu/lib/ ${ICU_INSTALL_DIR}/lib && - ${CMAKE_COMMAND} -E copy_directory ${ICU_PREFIX_DIR}/src/extern_icu/include ${ICU_INSTALL_DIR}/include - BUILD_BYPRODUCTS ${ICU_LIBRARIES} -@@ -113,22 +135,24 @@ ExternalProject_Add( - GIT_PROGRESS 1 - PREFIX ${ICU_PREFIX_DIR} - UPDATE_COMMAND "" -- CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "Linux/gcc" --enable-static --disable-shared --enable-rpath -+ CONFIGURE_COMMAND ${HOST_ENV_CMAKE} ../extern_icu/icu4c/source/runConfigureICU "Linux" ${ICU_CONFIGURE_FLAGS} --enable-static --enable-rpath - BUILD_COMMAND make -j4 - INSTALL_COMMAND make install prefix="" DESTDIR=${ICU_INSTALL_DIR} install - BUILD_BYPRODUCTS ${ICU_LIBRARIES} - ) - endif() - --list(LENGTH ICU_LIBRARIES ICU_LIB_LEN) -+list(LENGTH ICU_LIBRARIES_RELEASE ICU_LIB_LEN) - MATH(EXPR ICU_LIB_LEN "${ICU_LIB_LEN}-1") - - # icui18n icudata icuuc icuio icutu - foreach(ICU_IDX RANGE ${ICU_LIB_LEN}) -- list(GET ICU_LIBRARIES ${ICU_IDX} ICU_LIB) -+ list(GET ICU_LIBRARIES_RELEASE ${ICU_IDX} ICU_LIB_RELEASE) -+ list(GET ICU_LIBRARIES_DEBUG ${ICU_IDX} ICU_LIB_DEBUG) - list(GET ICU_BASE_NAMES ${ICU_IDX} ICU_BASE_NAME) - ADD_LIBRARY("icu${ICU_BASE_NAME}" STATIC IMPORTED GLOBAL) -- SET_PROPERTY(TARGET "icu${ICU_BASE_NAME}" PROPERTY IMPORTED_LOCATION ${ICU_LIB}) -+ SET_PROPERTY(TARGET "icu${ICU_BASE_NAME}" PROPERTY IMPORTED_LOCATION ${ICU_LIB_RELEASE}) -+ SET_PROPERTY(TARGET "icu${ICU_BASE_NAME}" PROPERTY IMPORTED_LOCATION_DEBUG ${ICU_LIB_DEBUG}) - ADD_DEPENDENCIES("icu${ICU_BASE_NAME}" extern_icu) - list(APPEND ICU_INTERFACE_LINK_LIBRARIES "icu${ICU_BASE_NAME}") - endforeach() From 7380898f41b94aeb163ef3578124f174cd0d9394 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 
2025 11:58:10 +0000 Subject: [PATCH 18/30] Delete FastTokenizer build form CI --- .github/workflows/linux.yml | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 1295ce0a..593b0454 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -63,10 +63,9 @@ jobs: openvino_tokenizers_cpack: - name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }}) + name: OpenVINO tokenizers cpack, BUILD_TYPE=${{ matrix.build_type }}) strategy: matrix: - build_fast_tokenizers: [ON] build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package needs: [ openvino_download ] if: | @@ -111,8 +110,7 @@ jobs: run: | apt-get update && apt install -y libicu-dev source ${INSTALL_DIR}/setupvars.sh - cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -S ${{ env.OPENVINO_TOKENIZERS_REPO }} \ -B ${{ env.BUILD_DIR }} @@ -139,15 +137,13 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }} + name: openvino_tokenizers_cpack_${{ matrix.build_type }} path: ${{ env.BUILD_DIR }}/*.tar.gz if-no-files-found: 'error' openvino_tokenizers_wheel: - name: OpenVINO tokenizers extension (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}) - strategy: - matrix: - build_fast_tokenizers: [ON, OFF] + name: OpenVINO tokenizers extension wheel + needs: [ openvino_download ] if: | always() && @@ -190,7 +186,6 @@ jobs: apt-get update && apt install -y libicu-dev python -m pip wheel -v --no-deps --wheel-dir ${BUILD_DIR} \ --config-settings=override=cross.arch="manylinux_2_31_x86_64" \ - --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ ${{ needs.openvino_download.outputs.ov_wheel_source }} \ ${OPENVINO_TOKENIZERS_REPO} env: @@ -206,15 +201,12 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }} + name: openvino_tokenizers_wheel path: ${{ env.BUILD_DIR }}/*.whl if-no-files-found: 'error' openvino_tokenizers_tests: - name: OpenVINO tokenizers tests (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}) - strategy: - matrix: - build_fast_tokenizers: [ON, OFF] + name: OpenVINO tokenizers tests needs: [ openvino_download, openvino_tokenizers_wheel] if: always() && needs.openvino_tokenizers_wheel.result == 'success' timeout-minutes: 45 @@ -244,7 +236,7 @@ jobs: - name: Download tokenizers package uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: - name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }} + name: openvino_tokenizers_wheel path: ${{ env.INSTALL_DIR }}/ov_tokenizers - name: Download OpenVINO package From 68d0300b0126899918d0ae636b57bd7ba6115da2 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 13:18:39 +0000 Subject: [PATCH 19/30] Delete FastTokenizer build form CI --- .github/workflows/mac.yml | 8 +++----- .github/workflows/windows.yml | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index bec1cd7a..0f6a78b5 
100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -176,10 +176,9 @@ jobs: if-no-files-found: 'error' openvino_tokenizers_cpack: - name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }}) + name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }}) strategy: matrix: - build_fast_tokenizers: [ON] build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package needs: [ openvino_download, openvino_build ] if: | @@ -226,8 +225,7 @@ jobs: - name: CMake configure - tokenizers run: | source ${INSTALL_DIR}/setupvars.sh - cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -S ${{ env.OPENVINO_TOKENIZERS_REPO }} \ -B ${{ env.BUILD_DIR }} @@ -254,7 +252,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }} + name: openvino_tokenizers_cpack_${{ matrix.build_type }} path: ${{ env.BUILD_DIR }}/*.tar.gz if-no-files-found: 'error' diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index ce5b79b8..539229e1 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -47,10 +47,9 @@ jobs: revision: 'latest_available_commit' openvino_tokenizers_cpack: - name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }}) + name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }}) strategy: matrix: - build_fast_tokenizers: [ON] build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package needs: [ openvino_download ] if: | @@ -115,8 +114,7 @@ jobs: shell: pwsh run: | ${{ env.OV_INSTALL_DIR }}/setupvars.ps1 - cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" ` - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ` + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ` -S ${{ env.OPENVINO_TOKENIZERS_REPO }} ` -B ${{ env.BUILD_DIR }} env: @@ -149,7 +147,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }} + name: openvino_tokenizers_cpack_${{ matrix.build_type }} path: ${{ env.BUILD_DIR }}/*.zip if-no-files-found: 'error' From fc094a027d3798fcbe113790235c51300f50cb58 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 14:45:53 +0000 Subject: [PATCH 20/30] Delete FastTokenizer from Cmake --- pyproject.toml | 1 + src/CMakeLists.txt | 64 +++++++++++++++++++--------------------------- 2 files changed, 27 insertions(+), 38 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22ca6ba2..56cfbf5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ [project.optional-dependencies] transformers = [ "transformers[sentencepiece] >= 4.36.0", + "jinja2", # has to be installed for sentencepiece-based transformers tokenizers to work "tiktoken" ] dev = [ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c9277003..0788fabb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,9 +56,6 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) # That prohibits linkage with prebuilt libraries because they aren't compiled with _GLIBCXX_USE_CXX11_ABI=0. 
get_directory_property(OPENVINO_RUNTIME_COMPILE_DEFINITIONS COMPILE_DEFINITIONS) include(CMakeDependentOption) -if("_GLIBCXX_USE_CXX11_ABI=0" IN_LIST OPENVINO_RUNTIME_COMPILE_DEFINITIONS) - set(USE_ABI0 ON CACHE BOOL "Set -D_GLIBCXX_USE_CXX11_ABI to 0 for fast_tokenizers") -endif() # # Compile flags @@ -97,44 +94,35 @@ endif() include(FetchContent) -if(NOT USE_ABI0) - # for ABI=0 case, we have to build from sources - find_package(sentencepiece QUIET) +FetchContent_Declare( +sentencepiece +URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz +URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 +) +FetchContent_GetProperties(sentencepiece) +if(NOT sentencepiece_POPULATED) +if(DEFINED ENV{CONDA_BUILD_SYSROOT}) + set(openvino_installed_from_conda ON) + # OpenVINO conda package dynamically linked with external protobuf, + # and we need to link sentencepiece with external protobuf too. + set(CMAKE_FIND_PACKAGE_PREFER_CONFIG ON) + set(protobuf_MODULE_COMPATIBLE ON CACHE BOOL "Protobuf module compatible") endif() - -if(sentencepiece_FOUND) - find_package(absl REQUIRED) +if(openvino_installed_from_conda AND NOT WIN32) + set(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "") + set(SPM_PROTOBUF_PROVIDER "package" CACHE STRING "") + set(SPM_ABSL_PROVIDER "package" CACHE STRING "") else() - FetchContent_Declare( - sentencepiece - URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz - URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 - ) - FetchContent_GetProperties(sentencepiece) - if(NOT sentencepiece_POPULATED) - if(DEFINED ENV{CONDA_BUILD_SYSROOT}) - set(openvino_installed_from_conda ON) - # OpenVINO conda package dynamically linked with external protobuf, - # and we need to link sentencepiece with external protobuf too. 
- set(CMAKE_FIND_PACKAGE_PREFER_CONFIG ON) - set(protobuf_MODULE_COMPATIBLE ON CACHE BOOL "Protobuf module compatible") - endif() - if(openvino_installed_from_conda AND NOT WIN32) - set(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "") - set(SPM_PROTOBUF_PROVIDER "package" CACHE STRING "") - set(SPM_ABSL_PROVIDER "package" CACHE STRING "") - else() - set(SPM_USE_BUILTIN_PROTOBUF ON CACHE BOOL "") - set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "") - set(SPM_ABSL_PROVIDER "internal" CACHE STRING "") - endif() + set(SPM_USE_BUILTIN_PROTOBUF ON CACHE BOOL "") + set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "") + set(SPM_ABSL_PROVIDER "internal" CACHE STRING "") +endif() - set(SPM_ENABLE_SHARED OFF CACHE BOOL "") - set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") - set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") - FetchContent_Populate(sentencepiece) - add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) - endif() +set(SPM_ENABLE_SHARED OFF CACHE BOOL "") +set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") +set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") +FetchContent_Populate(sentencepiece) +add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) endif() function(ov_tokenizers_link_sentencepiece TARGET_NAME) From 72b064633348d626e34ab5b10bacdc526fe4a1bf Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 10 Jan 2025 14:47:05 +0000 Subject: [PATCH 21/30] Delete FastTokenizer from Cmake --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 56cfbf5b..edfde65a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,6 @@ dependencies = [ [project.optional-dependencies] transformers = [ "transformers[sentencepiece] >= 4.36.0", - "jinja2", # has to be installed for sentencepiece-based transformers tokenizers to work "tiktoken" ] dev = [ @@ -48,6 +47,7 @@ dev = [ "pytest", "pytest_harvest", "pandas", + "jinja2", "openvino_tokenizers[transformers]" ] benchmark = [ From 4eb7dd065e1cd0f6cecbb015ca862efddb23fa06 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 13 Jan 2025 16:34:54 +0100 Subject: [PATCH 22/30] use custom icu --- src/CMakeLists.txt | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0788fabb..ca4b8e04 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,6 +24,8 @@ if(POLICY CMP0169) cmake_policy(SET CMP0169 OLD) endif() +option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) + function(ov_tokenizers_set_flags flags replace_value replace_pattern) foreach(flag ${flags}) if(${flag} MATCHES "${replace_pattern}") @@ -94,10 +96,32 @@ endif() include(FetchContent) +if (ENABLE_SYSTEM_ICU) + message(STATUS "Using system-installed ICU.") +else() + if(UNIX) + FetchContent_Declare( + ICU + URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Ubuntu-20.04-x64.tgz + URL_HASH SHA256=a8134e9f8a68d33600749601e143e553b5cb48c217c8941dbb9ef478fac420dd + ) + elseif(WIN32) + FetchContent_Declare( + ICU + URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Win64-MSVC2019.zip + URL_HASH SHA256=af6b585e49d90d39ae9d3fe298b7f56983931706a5e49d4bce675c6a499124e5 + ) + endif() + FetchContent_MakeAvailable(ICU) + set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") + list(PREPEND CMAKE_PREFIX_PATH "${ICU_DIR}") + message(STATUS "Using prebuilt ICU from ${ICU_DIR}.") +endif() 
+ FetchContent_Declare( -sentencepiece -URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz -URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 + sentencepiece + URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz + URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 ) FetchContent_GetProperties(sentencepiece) if(NOT sentencepiece_POPULATED) From cafaf0300cfd831ebfb7668f8f320352c65a9d82 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 13 Jan 2025 17:12:55 +0100 Subject: [PATCH 23/30] filter supported targets --- src/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 743b07e6..f27d3c38 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,8 +24,6 @@ if(POLICY CMP0169) cmake_policy(SET CMP0169 OLD) endif() -option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) - function(ov_tokenizers_set_flags flags replace_value replace_pattern) foreach(flag ${flags}) if(${flag} MATCHES "${replace_pattern}") @@ -96,7 +94,10 @@ endif() include(FetchContent) -if (ENABLE_SYSTEM_ICU) +option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) + +# There are no prebuilt ICU packages for macOS and Linux arm64 +if (ENABLE_SYSTEM_ICU OR APPLE OR NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") message(STATUS "Using system-installed ICU.") else() if(UNIX) @@ -113,9 +114,8 @@ else() ) endif() FetchContent_MakeAvailable(ICU) - set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") - list(PREPEND CMAKE_PREFIX_PATH "${ICU_DIR}") - message(STATUS "Using prebuilt ICU from ${ICU_DIR}.") + set(ICU_ROOT "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") + message(STATUS "Using prebuilt ICU from ${ICU_ROOT}.") endif() FetchContent_Declare( From deb6873ec75954ae4e45a7a7899e05310518fdaf Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 13 Jan 2025 17:14:13 +0100 Subject: [PATCH 24/30] removed tmp solution --- .github/workflows/linux.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 3db68e17..22bf6a7b 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -108,7 +108,6 @@ jobs: - name: CMake configure - tokenizers run: | - apt-get update && apt install -y libicu-dev source ${INSTALL_DIR}/setupvars.sh cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -S ${{ env.OPENVINO_TOKENIZERS_REPO }} \ @@ -183,7 +182,6 @@ jobs: - name: Build tokenizers wheel run: | - apt-get update && apt install -y libicu-dev python -m pip wheel -v --no-deps --wheel-dir ${BUILD_DIR} \ --config-settings=override=cross.arch="manylinux_2_31_x86_64" \ ${{ needs.openvino_download.outputs.ov_wheel_source }} \ From 0e1365874baf8d88e30e6710f31f642f0534b2ef Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 13 Jan 2025 17:30:37 +0100 Subject: [PATCH 25/30] brew icu4c --- .github/workflows/mac.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 72a6b012..beae1a50 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -84,7 +84,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja + run: brew install coreutils ninja icu4c - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 From 
ac21acded7cec9b1c26122e99ac57da83487eaf5 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Tue, 14 Jan 2025 08:43:06 +0100 Subject: [PATCH 26/30] install icu4c --- .github/workflows/mac.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index beae1a50..2c3e9ffe 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -84,7 +84,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja icu4c + run: brew install coreutils ninja - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 @@ -220,7 +220,7 @@ jobs: # Build # - name: Install build dependencies - run: brew install coreutils ninja + run: brew install coreutils ninja icu4c - name: CMake configure - tokenizers run: | @@ -312,7 +312,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja + run: brew install coreutils ninja icu4c # # Build From a9c5b389e007d1d1bb5e72c042e3956618004d60 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Tue, 14 Jan 2025 08:52:05 +0100 Subject: [PATCH 27/30] fixed arch detection --- src/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f27d3c38..4ffaa99f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -95,9 +95,9 @@ endif() include(FetchContent) option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) - +set(SUPPORTED_ARCHS "X86;X86_64") # There are no prebuilt ICU packages for macOS and Linux arm64 -if (ENABLE_SYSTEM_ICU OR APPLE OR NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") +if (ENABLE_SYSTEM_ICU OR APPLE OR NOT OV_HOST_ARCH IN_LIST SUPPORTED_ARCHS) message(STATUS "Using system-installed ICU.") else() if(UNIX) From e3eb2fd209ac43c5af9eacbff34927a42986a526 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Tue, 14 Jan 2025 09:28:40 +0100 Subject: [PATCH 28/30] fixed win subpath --- src/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffaa99f..173be66d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -106,16 +106,18 @@ else() URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Ubuntu-20.04-x64.tgz URL_HASH SHA256=a8134e9f8a68d33600749601e143e553b5cb48c217c8941dbb9ef478fac420dd ) + set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") elseif(WIN32) FetchContent_Declare( ICU URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Win64-MSVC2019.zip URL_HASH SHA256=af6b585e49d90d39ae9d3fe298b7f56983931706a5e49d4bce675c6a499124e5 ) + set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src") endif() FetchContent_MakeAvailable(ICU) - set(ICU_ROOT "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") - message(STATUS "Using prebuilt ICU from ${ICU_ROOT}.") + list(PREPEND CMAKE_PREFIX_PATH "${ICU_DIR}") + message(STATUS "Using prebuilt ICU from ${ICU_DIR}.") endif() FetchContent_Declare( From 60ec8e480f4c3286ada324ce0a9d71d388c770f7 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Wed, 15 Jan 2025 17:00:24 +0100 Subject: [PATCH 29/30] build from sources --- cmake/external/icu.cmake | 80 ++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 27 +++----------- 2 files changed, 85 insertions(+), 22 deletions(-) create mode 100644 cmake/external/icu.cmake diff --git a/cmake/external/icu.cmake b/cmake/external/icu.cmake new file mode 100644 index 
00000000..5b3ec1bd --- /dev/null +++ b/cmake/external/icu.cmake @@ -0,0 +1,80 @@ +include(FetchContent) + +set(THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/_deps/icu) +set(ICU_SOURCE_DIR ${THIRD_PARTY_PATH}/icu-src) +set(ICU_BINARY_DIR ${THIRD_PARTY_PATH}/icu-build) +SET(ICU_INSTALL_DIR ${THIRD_PARTY_PATH}/icu-install) + +set(HOST_ENV_CMAKE ${CMAKE_COMMAND} -E env + CC=${CMAKE_C_COMPILER} + CXX=${CMAKE_CXX_COMPILER} + CFLAGS=${CMAKE_C_FLAGS} + CXXFLAGS=${CMAKE_CXX_FLAGS} + LDFLAGS=${CMAKE_MODULE_LINKER_FLAGS} +) + +if(GENERATOR_IS_MULTI_CONFIG_VAR) + set(ICU_CONFIGURE_FLAGS $<$:"--enable-debug">$<$:"--enable-release">) + set(ICU_BUILD_TYPE $) +else() + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(ICU_CONFIGURE_FLAGS "--enable-debug") + else() + set(ICU_CONFIGURE_FLAGS "--enable-release") + endif() + set(ICU_BUILD_TYPE ${CMAKE_BUILD_TYPE}) +endif() + +set(FETCHCONTENT_QUIET FALSE) +# Fetch and build ICU +FetchContent_Declare( + ICU + URL https://github.com/unicode-org/icu/archive/refs/tags/release-70-1.tar.gz + URL_HASH SHA256=f30d670bdc03ba999638a2d2511952ab94adf204d0e14898666f2e0cacb7fef1 + SOURCE_DIR ${ICU_SOURCE_DIR} + BINARY_DIR ${ICU_BINARY_DIR} + DOWNLOAD_EXTRACT_TIMESTAMP TRUE +) + +FetchContent_MakeAvailable(ICU) + +if(NOT ICU_POPULATED) + # Configure the ICU build + message(STATUS "Configuring ICU...") + execute_process( + COMMAND ${ICU_SOURCE_DIR}/icu4c/source/runConfigureICU Linux --prefix ${ICU_INSTALL_DIR} ${ICU_CONFIGURE_FLAGS} + --disable-tests + --disable-samples + --disable-tools + --disable-extras + --disable-icuio + --disable-draft + WORKING_DIRECTORY ${ICU_BINARY_DIR} + ) + message(STATUS "Building ICU...") + execute_process( + COMMAND make -j${CMAKE_JOB_POOL_SIZE} + WORKING_DIRECTORY ${ICU_BINARY_DIR} + ) + message(STATUS "Installing ICU...") + execute_process( + COMMAND make install + WORKING_DIRECTORY ${ICU_BINARY_DIR} + ) +endif() +# Manually set ICU include and library directories +set(ICU_ROOT ${ICU_INSTALL_DIR}) + +if(WIN32) + set(SHARED_LIB_EXT "*.dll") +elseif(APPLE) + set(SHARED_LIB_EXT "*.dylib") +else() + set(SHARED_LIB_EXT "*.so") +endif() + +install( + DIRECTORY ${ICU_INSTALL_DIR}/lib/ + DESTINATION $ + FILES_MATCHING PATTERN "${SHARED_LIB_EXT}" +) \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 173be66d..e4c5d3b9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -95,29 +95,12 @@ endif() include(FetchContent) option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) -set(SUPPORTED_ARCHS "X86;X86_64") -# There are no prebuilt ICU packages for macOS and Linux arm64 -if (ENABLE_SYSTEM_ICU OR APPLE OR NOT OV_HOST_ARCH IN_LIST SUPPORTED_ARCHS) + +if(ENABLE_SYSTEM_ICU) message(STATUS "Using system-installed ICU.") else() - if(UNIX) - FetchContent_Declare( - ICU - URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Ubuntu-20.04-x64.tgz - URL_HASH SHA256=a8134e9f8a68d33600749601e143e553b5cb48c217c8941dbb9ef478fac420dd - ) - set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src/usr/local") - elseif(WIN32) - FetchContent_Declare( - ICU - URL https://github.com/unicode-org/icu/releases/download/release-70-1/icu4c-70_1-Win64-MSVC2019.zip - URL_HASH SHA256=af6b585e49d90d39ae9d3fe298b7f56983931706a5e49d4bce675c6a499124e5 - ) - set(ICU_DIR "${CMAKE_BINARY_DIR}/_deps/icu-src") - endif() - FetchContent_MakeAvailable(ICU) - list(PREPEND CMAKE_PREFIX_PATH "${ICU_DIR}") - message(STATUS "Using prebuilt ICU from ${ICU_DIR}.") + message(STATUS "ICU not found, building from source...") + 
include(${CMAKE_SOURCE_DIR}/cmake/external/icu.cmake) endif() FetchContent_Declare( @@ -334,7 +317,7 @@ install(FILES "${openvino_tokenizers_SOURCE_DIR}/LICENSE" "${openvino_tokenizers_SOURCE_DIR}/README.md" DESTINATION "docs/openvino_tokenizers" COMPONENT openvino_tokenizers_docs) - + # # Cpack configuration # From d971aacc66190ac083b8421ad057a3603a453aaa Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Fri, 17 Jan 2025 15:20:49 +0100 Subject: [PATCH 30/30] test commit --- CMakeLists.txt | 34 ++++++++ src/CMakeLists.txt | 202 +++++++++++++++++++++++---------------------- 2 files changed, 137 insertions(+), 99 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc277fd4..a5765b02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,39 @@ else() set(BUILD_TYPE ${CMAKE_BUILD_TYPE}) endif() +# Put binaries at the top level for NPM package +if(CPACK_GENERATOR STREQUAL "NPM") + set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR .) + set(OPENVINO_TOKENIZERS_INSTALL_BINDIR .) +else() + # - Windows: `\runtime\bin\intel64\Release\` + # - MacOS_x86: `/runtime/lib/intel64/Release` + # - MacOS_arm64: `/runtime/lib/arm64/Release/` + # - Linux_x86: `/runtime/lib/intel64/` + # - Linux_arm64: `/runtime/lib/aarch64/` + string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" OPENVINO_TOKENIZERS_INSTALL_DIR) + if(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64) + elseif(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") + if(APPLE) + set(OPENVINO_TOKENIZERS_INSTALL_DIR "arm64") + else() + set(OPENVINO_TOKENIZERS_INSTALL_DIR "aarch64") + endif() + elseif(OPENVINO_TOKENIZERS_INSTALL_DIR STREQUAL "x86_64" OR OPENVINO_TOKENIZERS_INSTALL_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64 + OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") + set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64) + endif() + + if(WIN32 OR APPLE) + set(OPENVINO_TOKENIZERS_INSTALL_DIR ${OPENVINO_TOKENIZERS_INSTALL_DIR}/${BUILD_TYPE}) + endif() + + set(OPENVINO_TOKENIZERS_INSTALL_BINDIR "runtime/bin/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of bin files - Windows dll") + set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR "runtime/lib/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of lib files") +endif() + + project(openvino_tokenizers VERSION 2025.0.0.0 DESCRIPTION "OpenVINO Tokenizers" @@ -78,6 +111,7 @@ if(BUILD_CPP_EXTENSION) endif() add_subdirectory(src) + endif() # install python files diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e4c5d3b9..2f401170 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -96,78 +96,109 @@ include(FetchContent) option(ENABLE_SYSTEM_ICU "Enables use of system ICU" OFF) -if(ENABLE_SYSTEM_ICU) - message(STATUS "Using system-installed ICU.") -else() +# if(ENABLE_SYSTEM_ICU) +# message(STATUS "Using system-installed ICU.") +# else() message(STATUS "ICU not found, building from source...") - include(${CMAKE_SOURCE_DIR}/cmake/external/icu.cmake) -endif() - -FetchContent_Declare( - sentencepiece - URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz - URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 -) -FetchContent_GetProperties(sentencepiece) -if(NOT sentencepiece_POPULATED) -if(DEFINED ENV{CONDA_BUILD_SYSROOT}) - set(openvino_installed_from_conda ON) - # OpenVINO conda package dynamically linked with external protobuf, - # and we need to link 
sentencepiece with external protobuf too. - set(CMAKE_FIND_PACKAGE_PREFER_CONFIG ON) - set(protobuf_MODULE_COMPATIBLE ON CACHE BOOL "Protobuf module compatible") -endif() -if(openvino_installed_from_conda AND NOT WIN32) - set(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "") - set(SPM_PROTOBUF_PROVIDER "package" CACHE STRING "") - set(SPM_ABSL_PROVIDER "package" CACHE STRING "") -else() - set(SPM_USE_BUILTIN_PROTOBUF ON CACHE BOOL "") - set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "") - set(SPM_ABSL_PROVIDER "internal" CACHE STRING "") -endif() - -set(SPM_ENABLE_SHARED OFF CACHE BOOL "") -set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") -set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") -FetchContent_Populate(sentencepiece) -add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) -endif() + # set(CMAKE_FIND_DEBUG_MODE ON) + # include(${CMAKE_SOURCE_DIR}/cmake/external/icu.cmake) + set(THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/_deps/icu) + set(ICU_SOURCE_DIR ${THIRD_PARTY_PATH}/icu-src CACHE PATH "Path to ICU source directory") + set(ICU_BUILD_DIR ${THIRD_PARTY_PATH}/icu-build CACHE PATH "Path to ICU build directory") + set(ICU_INSTALL_DIR ${THIRD_PARTY_PATH}/icu-install CACHE PATH "Path to ICU build directory") + + include(ExternalProject) + set_property(GLOBAL PROPERTY EP_STEP_TARGETS_VERBOSE ON) + # Add ICU as an external project + ExternalProject_Add( + icu_external + # GIT_REPOSITORY "https://github.com/unicode-org/icu.git" + # GIT_TAG "release-70-1" + URL https://github.com/unicode-org/icu/archive/refs/tags/release-70-1.tar.gz + URL_HASH SHA256=f30d670bdc03ba999638a2d2511952ab94adf204d0e14898666f2e0cacb7fef1 + PREFIX ${THIRD_PARTY_PATH} + SOURCE_DIR ${ICU_SOURCE_DIR} + BINARY_DIR ${ICU_BUILD_DIR} + INSTALL_DIR ${ICU_INSTALL_DIR} + CONFIGURE_COMMAND ${ICU_SOURCE_DIR}/icu4c/source/runConfigureICU Linux --prefix ${ICU_INSTALL_DIR} --disable-tests --disable-samples --disable-tools --disable-extras --disable-icuio --disable-draft --disable-icu-config + BUILD_COMMAND make -j${CMAKE_JOB_POOL_SIZE} + INSTALL_COMMAND make install + DOWNLOAD_EXTRACT_TIMESTAMP ON + ) + + list(PREPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/Modules") + find_package(ICU COMPONENTS i18n data uc REQUIRED) +# endif() + + include(${CMAKE_SOURCE_DIR}/cmake/external/sentencepiece.cmake) + +# FetchContent_Declare( +# sentencepiece +# URL https://github.com/google/sentencepiece/archive/d8f741853847553169444afc12c00f4bbff3e9ce.tar.gz +# URL_HASH SHA256=1cf6e0713ecd04d1dd3328fdd388aa89c8ebab518a15e0886b54eadd8d228886 +# ) +# FetchContent_GetProperties(sentencepiece) +# if(NOT sentencepiece_POPULATED) +# if(DEFINED ENV{CONDA_BUILD_SYSROOT}) +# set(openvino_installed_from_conda ON) +# # OpenVINO conda package dynamically linked with external protobuf, +# # and we need to link sentencepiece with external protobuf too. 
+# set(CMAKE_FIND_PACKAGE_PREFER_CONFIG ON) +# set(protobuf_MODULE_COMPATIBLE ON CACHE BOOL "Protobuf module compatible") +# endif() +# if(openvino_installed_from_conda AND NOT WIN32) +# set(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "") +# set(SPM_PROTOBUF_PROVIDER "package" CACHE STRING "") +# set(SPM_ABSL_PROVIDER "package" CACHE STRING "") +# else() +# set(SPM_USE_BUILTIN_PROTOBUF ON CACHE BOOL "") +# set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "") +# set(SPM_ABSL_PROVIDER "internal" CACHE STRING "") +# endif() + +# set(SPM_ENABLE_SHARED OFF CACHE BOOL "") +# set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "") +# set(SPM_ENABLE_NFKC_COMPILE ON CACHE BOOL "Enable NFKC compile") + +# FetchContent_Populate(sentencepiece) +# add_subdirectory(${sentencepiece_SOURCE_DIR} ${sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL) +# endif() function(ov_tokenizers_link_sentencepiece TARGET_NAME) - if(sentencepiece_FOUND) - foreach(sp_target sentencepiece sentencepiece_train) - if(TARGET ${sp_target}-static) - # on Windows conda-forge builds sentencepiece as static library - target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}-static) - else() - target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}) - endif() - endforeach() - target_link_libraries(${TARGET_NAME} PRIVATE absl::string_view absl::flat_hash_set) - else() - target_include_directories(${TARGET_NAME} SYSTEM PRIVATE - "${sentencepiece_SOURCE_DIR}/src/builtin_pb" - "${sentencepiece_SOURCE_DIR}/src" - "${sentencepiece_SOURCE_DIR}/third_party/protobuf-lite" - "${sentencepiece_SOURCE_DIR}/third_party/" # for libabseil - "${sentencepiece_SOURCE_DIR}" - "${sentencepiece_BINARY_DIR}") - - foreach(sp_target sentencepiece-static sentencepiece_train-static) - if(CMAKE_CL_64) - target_compile_definitions(${sp_target} PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) - endif() - # to propogate _GLIBCXX_USE_CXX11_ABI value - target_compile_definitions(${sp_target} PUBLIC $) - target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}) - endforeach(sp_target sentencepiece sentencepiece_train) - - if(ANDROID) - # see https://github.com/protocolbuffers/protobuf/issues/2719#issuecomment-625400968 - target_link_libraries(${TARGET_NAME} PRIVATE log) + foreach(sp_target sentencepiece sentencepiece_train) + if(TARGET ${sp_target}-static) + # on Windows conda-forge builds sentencepiece as static library + target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}-static) + else() + target_link_libraries(${TARGET_NAME} INTERFACE ${sp_target}) endif() + endforeach() + target_link_libraries(${TARGET_NAME} INTERFACE absl::string_view absl::flat_hash_set) + + target_include_directories(${TARGET_NAME} SYSTEM PRIVATE + "${SPM_SOURCE_DIR}/src/builtin_pb" + "${SPM_SOURCE_DIR}/src" + "${SPM_SOURCE_DIR}/third_party/protobuf-lite" + "${SPM_SOURCE_DIR}/third_party/" # for libabseil + "${SPM_SOURCE_DIR}" + "${SPM_BINARY_DIR}") + + target_link_libraries(${TARGET_NAME} INTERFACE sentencepiece::sentencepiece) + + # foreach(sp_target sentencepiece-static sentencepiece_train-static) + # if(CMAKE_CL_64) + # target_compile_definitions(${sp_target} PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS) + # endif() + # # to propogate _GLIBCXX_USE_CXX11_ABI value + # # target_compile_definitions(${sp_target} PUBLIC $) + # target_link_libraries(${TARGET_NAME} PRIVATE ${sp_target}) + # endforeach(sp_target sentencepiece sentencepiece_train) + + if(ANDROID) + # see https://github.com/protocolbuffers/protobuf/issues/2719#issuecomment-625400968 + target_link_libraries(${TARGET_NAME} 
PRIVATE log) endif() + endfunction() function(ov_tokenizers_build_static_re2) @@ -241,6 +272,11 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_OPTIONS "${extra_flags}" target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_OPENVINO_EXTENSION_API) target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime openvino::threading) +# Ensure ICU is built before your main project +add_dependencies(${TARGET_NAME} sentencepiece) +# Add ICU include and library directories to the target +target_link_libraries(${TARGET_NAME} INTERFACE ICU::i18n ICU::uc ICU::data) + # # Set install RPATH # @@ -266,38 +302,6 @@ endif() # Installation # -# Put binaries at the top level for NPM package -if(CPACK_GENERATOR STREQUAL "NPM") - set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR .) - set(OPENVINO_TOKENIZERS_INSTALL_BINDIR .) -else() - # - Windows: `\runtime\bin\intel64\Release\` - # - MacOS_x86: `/runtime/lib/intel64/Release` - # - MacOS_arm64: `/runtime/lib/arm64/Release/` - # - Linux_x86: `/runtime/lib/intel64/` - # - Linux_arm64: `/runtime/lib/aarch64/` - string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" OPENVINO_TOKENIZERS_INSTALL_DIR) - if(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "amd64.*|x86_64.*|AMD64.*") - set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64) - elseif(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") - if(APPLE) - set(OPENVINO_TOKENIZERS_INSTALL_DIR "arm64") - else() - set(OPENVINO_TOKENIZERS_INSTALL_DIR "aarch64") - endif() - elseif(OPENVINO_TOKENIZERS_INSTALL_DIR STREQUAL "x86_64" OR OPENVINO_TOKENIZERS_INSTALL_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64 - OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") - set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64) - endif() - - if(WIN32 OR APPLE) - set(OPENVINO_TOKENIZERS_INSTALL_DIR ${OPENVINO_TOKENIZERS_INSTALL_DIR}/${BUILD_TYPE}) - endif() - - set(OPENVINO_TOKENIZERS_INSTALL_BINDIR "runtime/bin/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of bin files - Windows dll") - set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR "runtime/lib/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of lib files") -endif() - # Installing the extension module install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION ${OPENVINO_TOKENIZERS_INSTALL_LIBDIR} COMPONENT openvino_tokenizers