Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU build from sources #371

Draft
wants to merge 35 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
1dd6b66
Add unicode normalization layer tests
apaniukov Nov 26, 2024
afd4a60
WiP
apaniukov Nov 29, 2024
2f24fec
WiP
apaniukov Dec 20, 2024
08052c2
Switch Casefold and UnicodeNormalization to CharsMap
apaniukov Jan 8, 2025
f6c001b
Add unicode normalization layer tests
apaniukov Nov 26, 2024
472b163
WiP
apaniukov Nov 29, 2024
04fb20c
WiP
apaniukov Dec 20, 2024
ed1203f
Switch Casefold and UnicodeNormalization to CharsMap
apaniukov Jan 8, 2025
012fb8e
Update tests and fix custom charsmap support
apaniukov Jan 9, 2025
e3831ec
Merge remote-tracking branch 'origin/update-normalization' into updat…
apaniukov Jan 9, 2025
8092720
Ruff checks
apaniukov Jan 9, 2025
df34dee
Merge branch 'master' into update-normalization
apaniukov Jan 9, 2025
6a611f3
wip
apaniukov Jan 9, 2025
258f0f4
wip
apaniukov Jan 9, 2025
baf0e70
wip
apaniukov Jan 9, 2025
6177b81
Switch Off FastTokenizer
apaniukov Jan 10, 2025
68b7e4e
Delete torch from dependencies
apaniukov Jan 10, 2025
7244191
Delete FastTokenizer from cmake and readme
apaniukov Jan 10, 2025
082064c
Delete FastTokenizer related patches
apaniukov Jan 10, 2025
7380898
Delete FastTokenizer build form CI
apaniukov Jan 10, 2025
68d0300
Delete FastTokenizer build form CI
apaniukov Jan 10, 2025
fc094a0
Delete FastTokenizer from Cmake
apaniukov Jan 10, 2025
72b0646
Delete FastTokenizer from Cmake
apaniukov Jan 10, 2025
4eb7dd0
use custom icu
mryzhov Jan 13, 2025
9a85097
Merge branch 'master' into icu_build
mryzhov Jan 13, 2025
cafaf03
filter supported targets
mryzhov Jan 13, 2025
deb6873
removed tmp solution
mryzhov Jan 13, 2025
0e13658
brew icu4c
mryzhov Jan 13, 2025
ac21acd
install icu4c
mryzhov Jan 14, 2025
a9c5b38
fixed arch detection
mryzhov Jan 14, 2025
e3eb2fd
fixed win subpath
mryzhov Jan 14, 2025
02fc991
Merge branch 'master' into icu_build_sources
mryzhov Jan 15, 2025
60ec8e4
build from sources
mryzhov Jan 15, 2025
d971aac
test commit
mryzhov Jan 17, 2025
788ef1c
Merge branch 'icu_build_sources' of https://github.com/mryzhov/openvi…
mryzhov Jan 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 8 additions & 16 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,9 @@ jobs:


openvino_tokenizers_cpack:
name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }})
name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }})
strategy:
matrix:
build_fast_tokenizers: [ON]
build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package
needs: [ openvino_download ]
if: |
Expand Down Expand Up @@ -110,8 +109,7 @@ jobs:
- name: CMake configure - tokenizers
run: |
source ${INSTALL_DIR}/setupvars.sh
cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-S ${{ env.OPENVINO_TOKENIZERS_REPO }} \
-B ${{ env.BUILD_DIR }}

Expand All @@ -138,15 +136,13 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
name: openvino_tokenizers_cpack_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.tar.gz
if-no-files-found: 'error'

openvino_tokenizers_wheel:
name: OpenVINO tokenizers extension (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }})
strategy:
matrix:
build_fast_tokenizers: [ON, OFF]
name: OpenVINO tokenizers extension wheel

needs: [ openvino_download ]
if: |
always() &&
Expand Down Expand Up @@ -188,7 +184,6 @@ jobs:
run: |
python -m pip wheel -v --no-deps --wheel-dir ${BUILD_DIR} \
--config-settings=override=cross.arch="manylinux_2_31_x86_64" \
--config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \
${{ needs.openvino_download.outputs.ov_wheel_source }} \
${OPENVINO_TOKENIZERS_REPO}
env:
Expand All @@ -204,15 +199,12 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }}
name: openvino_tokenizers_wheel
path: ${{ env.BUILD_DIR }}/*.whl
if-no-files-found: 'error'

openvino_tokenizers_tests:
name: OpenVINO tokenizers tests (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }})
strategy:
matrix:
build_fast_tokenizers: [ON, OFF]
name: OpenVINO tokenizers tests
needs: [ openvino_download, openvino_tokenizers_wheel]
if: always() && needs.openvino_tokenizers_wheel.result == 'success'
timeout-minutes: 45
Expand Down Expand Up @@ -242,7 +234,7 @@ jobs:
- name: Download tokenizers package
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }}
name: openvino_tokenizers_wheel
path: ${{ env.INSTALL_DIR }}/ov_tokenizers

- name: Download OpenVINO package
Expand Down
12 changes: 5 additions & 7 deletions .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,9 @@ jobs:
if-no-files-found: 'error'

openvino_tokenizers_cpack:
name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }})
name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }})
strategy:
matrix:
build_fast_tokenizers: [ON]
build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package
needs: [ openvino_download, openvino_build ]
if: |
Expand Down Expand Up @@ -221,13 +220,12 @@ jobs:
# Build
#
- name: Install build dependencies
run: brew install coreutils ninja
run: brew install coreutils ninja icu4c

- name: CMake configure - tokenizers
run: |
source ${INSTALL_DIR}/setupvars.sh
cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-S ${{ env.OPENVINO_TOKENIZERS_REPO }} \
-B ${{ env.BUILD_DIR }}

Expand All @@ -254,7 +252,7 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
name: openvino_tokenizers_cpack_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.tar.gz
if-no-files-found: 'error'

Expand Down Expand Up @@ -314,7 +312,7 @@ jobs:
#

- name: Install build dependencies
run: brew install coreutils ninja
run: brew install coreutils ninja icu4c

#
# Build
Expand Down
8 changes: 3 additions & 5 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@ jobs:
revision: 'latest_available_commit'

openvino_tokenizers_cpack:
name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }})
name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }})
strategy:
matrix:
build_fast_tokenizers: [ON]
build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package
needs: [ openvino_download ]
if: |
Expand Down Expand Up @@ -115,8 +114,7 @@ jobs:
shell: pwsh
run: |
${{ env.OV_INSTALL_DIR }}/setupvars.ps1
cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" `
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} `
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} `
-S ${{ env.OPENVINO_TOKENIZERS_REPO }} `
-B ${{ env.BUILD_DIR }}
env:
Expand Down Expand Up @@ -149,7 +147,7 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
name: openvino_tokenizers_cpack_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.zip
if-no-files-found: 'error'

Expand Down
34 changes: 34 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,39 @@ else()
set(BUILD_TYPE ${CMAKE_BUILD_TYPE})
endif()

# Select the install sub-directories for the tokenizers binaries.
#
# Results:
#   OPENVINO_TOKENIZERS_INSTALL_BINDIR - destination for bin files (Windows dll)
#   OPENVINO_TOKENIZERS_INSTALL_LIBDIR - destination for lib files
#
# Put binaries at the top level for NPM package
if("${CPACK_GENERATOR}" STREQUAL "NPM")
    set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR .)
    set(OPENVINO_TOKENIZERS_INSTALL_BINDIR .)
else()
    # - Windows: `<openvino_dir>\runtime\bin\intel64\Release\`
    # - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release`
    # - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/`
    # - Linux_x86: `<openvino_dir>/runtime/lib/intel64/`
    # - Linux_arm64: `<openvino_dir>/runtime/lib/aarch64/`
    #
    # NOTE(review): CMAKE_SYSTEM_PROCESSOR is normally populated by project(),
    # and this block sits before the project() call - confirm the variable is
    # already set here, otherwise an empty architecture ends up in the cache.
    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" OPENVINO_TOKENIZERS_INSTALL_DIR)

    # The name is lower-cased above, so only lower-case spellings can match
    # (Windows reports Intel's 64-bit CPU as AMD64, which becomes amd64 here).
    if(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "amd64|x86_64")
        set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64)
    elseif(OPENVINO_TOKENIZERS_INSTALL_DIR MATCHES "^(arm64|aarch64)")
        # Apple and the other platforms use different names for 64-bit ARM.
        if(APPLE)
            set(OPENVINO_TOKENIZERS_INSTALL_DIR "arm64")
        else()
            set(OPENVINO_TOKENIZERS_INSTALL_DIR "aarch64")
        endif()
    elseif("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "x86_64")
        # Only reached when the processor name is neither x86-64 nor 64-bit ARM.
        set(OPENVINO_TOKENIZERS_INSTALL_DIR intel64)
    endif()

    # Windows and macOS layouts carry an extra per-build-type directory level.
    if(WIN32 OR APPLE)
        set(OPENVINO_TOKENIZERS_INSTALL_DIR ${OPENVINO_TOKENIZERS_INSTALL_DIR}/${BUILD_TYPE})
    endif()

    set(OPENVINO_TOKENIZERS_INSTALL_BINDIR "runtime/bin/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of bin files - Windows dll")
    set(OPENVINO_TOKENIZERS_INSTALL_LIBDIR "runtime/lib/${OPENVINO_TOKENIZERS_INSTALL_DIR}" CACHE STRING "Destination for files installation of lib files")
endif()


project(openvino_tokenizers
VERSION 2025.0.0.0
DESCRIPTION "OpenVINO Tokenizers"
Expand Down Expand Up @@ -78,6 +111,7 @@ if(BUILD_CPP_EXTENSION)
endif()

add_subdirectory(src)

endif()

# install python files
Expand Down
71 changes: 0 additions & 71 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,77 +150,6 @@ make

After that, you can transfer all binaries from `build/src` to `<openvino_dir>` as described in the C++ installation instruction above.

### Reducing the ICU Data Size

By default, all available ICU locales are supported, which significantly increases the package size. To reduce the size of the ICU libraries included in your final package, follow these steps:

1. **Use the ICU Data Configuration File**:
- This file specifies which features and locales to include in a custom data bundle. You can find more information [here](https://unicode-org.github.io/icu/userguide/icu_data/buildtool.html#icu-data-configuration-file).

2. **Set the ICU Data Filter File as an Environment Variable**:
- **On Unix-like systems (Linux, macOS)**:
Set the `ICU_DATA_FILTER_FILE` environment variable to the path of your configuration file (`filters.json`):

```bash
export ICU_DATA_FILTER_FILE="filters.json"
```

- **On Windows**:
Set the `ICU_DATA_FILTER_FILE` environment variable using the Command Prompt or PowerShell:

**Command Prompt:**
```cmd
set ICU_DATA_FILTER_FILE=filters.json
```

**PowerShell:**
```powershell
$env:ICU_DATA_FILTER_FILE="filters.json"
```

3. **Create a Configuration File**:
- An example configuration file (`filters.json`) might look like this:

```json
{
"localeFilter": {
"filterType": "language",
"includelist": [
"en"
]
}
}
```

4. **Configure OpenVINO Tokenizers**:
- When building OpenVINO tokenizers, set the following CMake option during the project configuration:

```bash
-DBUILD_FAST_TOKENIZERS=ON
```
- Example for a pip installation path:
```bash
ICU_DATA_FILTER_FILE=</path/to/filters.json> pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS=ON
```

By following these instructions, you can effectively reduce the size of the ICU libraries in your final package.

### Build OpenVINO Tokenizers without FastTokenizer Library

If a tokenizer doesn't use `CaseFold`, `UnicodeNormalization` or `Wordpiece` operations, you can drastically reduce package binary size by building OpenVINO Tokenizers without FastTokenizer dependency with this flag:

```bash
-DENABLE_FAST_TOKENIZERS=OFF
```

This option can also help with building for a platform that is not supported by FastTokenizer, for example `Android x86_64`.

Example for a pip installation path:
```bash

pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.ENABLE_FAST_TOKENIZERS=OFF
```

## Usage

:warning: OpenVINO Tokenizers can be inferred on a `CPU` device only.
Expand Down
80 changes: 80 additions & 0 deletions cmake/external/icu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Downloads the ICU sources and configures, builds and installs them at CMake
# configure time, then registers an install rule for the resulting shared
# libraries.
#
# Variables defined for the including scope:
#   THIRD_PARTY_PATH    - root of the ICU working tree inside the build directory
#   ICU_SOURCE_DIR      - where the ICU archive is extracted
#   ICU_BINARY_DIR      - where ICU is configured and built
#   ICU_INSTALL_DIR     - prefix ICU is installed into
#   ICU_ROOT            - same value as ICU_INSTALL_DIR
#   ICU_CONFIGURE_FLAGS - debug/release switch for the ICU configure script
#   ICU_BUILD_TYPE      - build type associated with the ICU build
#   HOST_ENV_CMAKE      - `cmake -E env` prefix carrying the host compiler settings
#   SHARED_LIB_EXT      - glob used to pick the shared libraries to install
#
# Reads GENERATOR_IS_MULTI_CONFIG_VAR and TARGET_NAME, which are not defined in
# this file.
include(FetchContent)
include(ProcessorCount)

set(THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/_deps/icu)
set(ICU_SOURCE_DIR ${THIRD_PARTY_PATH}/icu-src)
set(ICU_BINARY_DIR ${THIRD_PARTY_PATH}/icu-build)
set(ICU_INSTALL_DIR ${THIRD_PARTY_PATH}/icu-install)

# NOTE(review): this prefix is not used by any command in this file; it looks
# like it was meant to wrap the ICU configure step - confirm before wiring it in.
set(HOST_ENV_CMAKE ${CMAKE_COMMAND} -E env
    CC=${CMAKE_C_COMPILER}
    CXX=${CMAKE_CXX_COMPILER}
    CFLAGS=${CMAKE_C_FLAGS}
    CXXFLAGS=${CMAKE_CXX_FLAGS}
    LDFLAGS=${CMAKE_MODULE_LINKER_FLAGS}
)

if(GENERATOR_IS_MULTI_CONFIG_VAR)
    set(ICU_CONFIGURE_FLAGS $<$<CONFIG:Debug>:"--enable-debug">$<$<CONFIG:Release>:"--enable-release">)
    set(ICU_BUILD_TYPE $<CONFIG>)
    # Generator expressions are only evaluated at generate time, while ICU is
    # configured right now through execute_process(), so the expression above
    # cannot be handed to the configure script. Use a release build instead.
    set(icu_configure_mode "--enable-release")
else()
    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
        set(ICU_CONFIGURE_FLAGS "--enable-debug")
    else()
        set(ICU_CONFIGURE_FLAGS "--enable-release")
    endif()
    set(ICU_BUILD_TYPE ${CMAKE_BUILD_TYPE})
    set(icu_configure_mode ${ICU_CONFIGURE_FLAGS})
endif()

set(FETCHCONTENT_QUIET FALSE)
# Fetch and build ICU
FetchContent_Declare(
    ICU
    URL https://github.com/unicode-org/icu/archive/refs/tags/release-70-1.tar.gz
    URL_HASH SHA256=f30d670bdc03ba999638a2d2511952ab94adf204d0e14898666f2e0cacb7fef1
    SOURCE_DIR ${ICU_SOURCE_DIR}
    BINARY_DIR ${ICU_BINARY_DIR}
    DOWNLOAD_EXTRACT_TIMESTAMP TRUE
)

FetchContent_MakeAvailable(ICU)

# Runs one ICU build step inside ICU_BINARY_DIR.
#   description - status message printed before the step starts
#   out_ok      - name of a caller variable set to TRUE on success, FALSE otherwise
#   ARGN        - the command line to execute
# A failing step is reported with a warning instead of being silently ignored.
function(ov_tokenizers_icu_run_step description out_ok)
    message(STATUS "${description}")
    execute_process(
        COMMAND ${ARGN}
        WORKING_DIRECTORY "${ICU_BINARY_DIR}"
        RESULT_VARIABLE icu_step_result
    )
    if(icu_step_result EQUAL 0)
        set(${out_ok} TRUE PARENT_SCOPE)
    else()
        message(WARNING "ICU step failed (result: ${icu_step_result}): ${description}")
        set(${out_ok} FALSE PARENT_SCOPE)
    endif()
endfunction()

# FetchContent only defines the lower-case `icu_POPULATED`, and that is always
# true after FetchContent_MakeAvailable(), so it cannot tell whether ICU has
# been built. A stamp file written after a successful install is used instead;
# delete it to force ICU to be configured and built again.
set(icu_install_stamp "${ICU_INSTALL_DIR}/.icu_install_complete")

if(NOT EXISTS "${icu_install_stamp}")
    # execute_process() needs the working directory to exist.
    file(MAKE_DIRECTORY "${ICU_BINARY_DIR}")

    # Bounded parallelism for make; ProcessorCount() yields 0 when it cannot tell.
    ProcessorCount(icu_jobs)
    if(icu_jobs EQUAL 0)
        set(icu_jobs 1)
    endif()

    # Configure the ICU build
    # NOTE(review): the platform argument is hard-coded to `Linux` and the build
    # below calls `make` directly - confirm this file is only used on Linux hosts.
    ov_tokenizers_icu_run_step("Configuring ICU..." icu_step_ok
        "${ICU_SOURCE_DIR}/icu4c/source/runConfigureICU" Linux --prefix "${ICU_INSTALL_DIR}" ${icu_configure_mode}
        --disable-tests
        --disable-samples
        --disable-tools
        --disable-extras
        --disable-icuio
        --disable-draft
    )
    if(icu_step_ok)
        ov_tokenizers_icu_run_step("Building ICU..." icu_step_ok
            make -j${icu_jobs}
        )
    endif()
    if(icu_step_ok)
        ov_tokenizers_icu_run_step("Installing ICU..." icu_step_ok
            make install
        )
    endif()
    if(icu_step_ok)
        file(WRITE "${icu_install_stamp}" "ICU installed\n")
    endif()
endif()

# Manually set ICU include and library directories
set(ICU_ROOT ${ICU_INSTALL_DIR})

if(WIN32)
    set(SHARED_LIB_EXT "*.dll")
elseif(APPLE)
    set(SHARED_LIB_EXT "*.dylib")
else()
    # Also match the versioned files (e.g. `libicuuc.so.70`): the plain `.so`
    # entries are symlinks to them and would dangle if installed alone.
    set(SHARED_LIB_EXT "*.so*")
endif()

# NOTE(review): the destination resolves to the directory of the built target,
# i.e. a location inside the build tree rather than under the install prefix -
# confirm this is intended for packaging.
install(
    DIRECTORY ${ICU_INSTALL_DIR}/lib/
    DESTINATION $<TARGET_FILE_DIR:${TARGET_NAME}>
    FILES_MATCHING PATTERN "${SHARED_LIB_EXT}"
)
7 changes: 2 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,14 @@ transformers = [
"transformers[sentencepiece] >= 4.36.0",
"tiktoken"
]
# chatglm2 custom tokenizer file imports torch, have to add torch dependency for tests
torch = [
'torch'
]
dev = [
"ruff",
"bandit",
"pytest",
"pytest_harvest",
"pandas",
"openvino_tokenizers[transformers, torch]"
"jinja2",
"openvino_tokenizers[transformers]"
]
benchmark = [
"pandas",
Expand Down
Loading
Loading