From c371b117de57074e543919f347dc73abf37ba488 Mon Sep 17 00:00:00 2001
From: Manul from Pathway
Date: Wed, 27 Dec 2023 14:25:55 +0100
Subject: [PATCH] Release 0.7.7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Michał Bartoszkiewicz
Co-authored-by: Jan Chorowski
Co-authored-by: Xavier Gendre
Co-authored-by: Adrian Kosowski
Co-authored-by: Jakub Kowalski
Co-authored-by: Sergey Kulik
Co-authored-by: Mateusz Lewandowski
Co-authored-by: Mohamed Malhou
Co-authored-by: Krzysztof Nowicki
Co-authored-by: Richard Pelgrim
Co-authored-by: Kamil Piechowiak
Co-authored-by: Paweł Podhajski
Co-authored-by: Olivier Ruas
Co-authored-by: Przemysław Uznański
Co-authored-by: Sebastian Włudzik
GitOrigin-RevId: 312344420a55f049c50addb049b77b403a5ce194
---
 CHANGELOG.md                          |  5 ++
 Cargo.lock                            |  2 +-
 Cargo.toml                            |  2 +-
 python/pathway/xpacks/llm/splitter.py | 77 +++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e4ca8ce..9a6a12b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+## [0.7.7] - 2023-12-27
+
+### Added
+- pathway.xpacks.llm.splitter.TokenCountSplitter.
+
 ## [0.7.6] - 2023-12-22
 
 ## New Features
diff --git a/Cargo.lock b/Cargo.lock
index 631d055b..1565d4f7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1505,7 +1505,7 @@ dependencies = [
 
 [[package]]
 name = "pathway"
-version = "0.7.6"
+version = "0.7.7"
 dependencies = [
  "arc-swap",
  "arcstr",
diff --git a/Cargo.toml b/Cargo.toml
index 162e28a9..e324cc06 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "pathway"
-version = "0.7.6"
+version = "0.7.7"
 edition = "2021"
 publish = false
 rust-version = "1.72.0"
diff --git a/python/pathway/xpacks/llm/splitter.py b/python/pathway/xpacks/llm/splitter.py
index 983ded63..6e335166 100644
--- a/python/pathway/xpacks/llm/splitter.py
+++ b/python/pathway/xpacks/llm/splitter.py
@@ -2,6 +2,7 @@
 A library of text splitters - routines which split a long text into smaller chunks.
 """
 
+import unicodedata
 from typing import Dict, List, Tuple
 
 
@@ -17,3 +18,79 @@ def null_splitter(txt: str) -> List[Tuple[str, Dict]]:
     The null splitter always returns a list of length one containing the full text and empty metadata.
     """
     return [(txt, {})]
+
+
+def _normalize_unicode(text: str):
+    """
+    Get rid of ligatures
+    """
+    return unicodedata.normalize("NFKC", text)
+
+
+class TokenCountSplitter:
+    """
+    Splits a given string or a list of strings into chunks based on token
+    count.
+
+    This splitter tokenizes the input texts and splits them into smaller parts ("chunks")
+    ensuring that each chunk has a token count between `min_tokens` and
+    `max_tokens`. It also attempts to break chunks at sensible points such as
+    punctuation marks.
+
+    Arguments:
+        min_tokens: minimum tokens in a chunk of text.
+        max_tokens: maximum size of a chunk in tokens.
+        encoding_name: name of the encoding from `tiktoken`.
+
+    Example:
+
+    # >>> from pathway.xpacks.llm.splitter import TokenCountSplitter
+    # >>> import pathway as pw
+    # >>> t = pw.debug.table_from_markdown(
+    # ...     '''| text
+    # ... 1| cooltext'''
+    # ... )
+    # >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1)
+    # >>> t += t.select(chunks = pw.apply(splitter, pw.this.text))
+    # >>> pw.debug.compute_and_print(t, include_id=False)
+    # text | chunks
+    # cooltext | (('cool', pw.Json({})), ('text', pw.Json({})))
+    """
+
+    CHARS_PER_TOKEN = 3
+    PUNCTUATION = [".", "?", "!", "\n"]
+
+    def __init__(
+        self,
+        min_tokens: int = 50,
+        max_tokens: int = 500,
+        encoding_name: str = "cl100k_base",
+    ):
+        self.min_tokens = min_tokens
+        self.max_tokens = max_tokens
+        self.encoding_name = encoding_name
+
+    def __call__(self, txt: str) -> List[Tuple[str, Dict]]:
+        import tiktoken
+
+        tokenizer = tiktoken.get_encoding(self.encoding_name)
+        text = _normalize_unicode(txt)
+        tokens = tokenizer.encode_ordinary(text)
+        output: List[Tuple[str, Dict]] = []
+        i = 0
+        while i < len(tokens):
+            chunk_tokens = tokens[i : i + self.max_tokens]
+            chunk = tokenizer.decode(chunk_tokens)
+            last_punctuation = max(
+                [chunk.rfind(p) for p in self.PUNCTUATION], default=-1
+            )
+            if (
+                last_punctuation != -1
+                and last_punctuation > self.CHARS_PER_TOKEN * self.min_tokens
+            ):
+                chunk = chunk[: last_punctuation + 1]
+
+            i += len(tokenizer.encode_ordinary(chunk))
+
+            output.append((chunk, {}))
+        return output
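
For quick reference, below is a minimal sketch of calling the new TokenCountSplitter directly on a plain string, outside a Pathway table. It assumes `tiktoken` is installed; the sample text and the small token bounds are made up for illustration and are not part of the patch.

    # Illustrative only: direct call to the splitter added in this release.
    from pathway.xpacks.llm.splitter import TokenCountSplitter

    # Made-up bounds, kept small so the short sample text can yield more than one chunk.
    splitter = TokenCountSplitter(min_tokens=5, max_tokens=20)
    text = (
        "Pathway ingests data streams. The LLM xpack can also split long "
        "documents. Each chunk stays within the configured token bounds."
    )

    for chunk, metadata in splitter(text):
        # __call__ returns a list of (chunk_text, metadata_dict) tuples.
        print(repr(chunk), metadata)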