Release 0.7.7
Co-authored-by: Michał Bartoszkiewicz <[email protected]>
Co-authored-by: Jan Chorowski <[email protected]>
Co-authored-by: Xavier Gendre <[email protected]>
Co-authored-by: Adrian Kosowski <[email protected]>
Co-authored-by: Jakub Kowalski <[email protected]>
Co-authored-by: Sergey Kulik <[email protected]>
Co-authored-by: Mateusz Lewandowski <[email protected]>
Co-authored-by: Mohamed Malhou <[email protected]>
Co-authored-by: Krzysztof Nowicki <[email protected]>
Co-authored-by: Richard Pelgrim <[email protected]>
Co-authored-by: Kamil Piechowiak <[email protected]>
Co-authored-by: Paweł Podhajski <[email protected]>
Co-authored-by: Olivier Ruas <[email protected]>
Co-authored-by: Przemysław Uznański <[email protected]>
Co-authored-by: Sebastian Włudzik <[email protected]>
GitOrigin-RevId: 312344420a55f049c50addb049b77b403a5ce194
16 people committed Dec 27, 2023
1 parent 06c1ad0 commit c371b11
Showing 4 changed files with 84 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]

## [0.7.7] - 2023-12-27

### Added
- pathway.xpacks.llm.splitter.TokenCountSplitter.

## [0.7.6] - 2023-12-22

## New Features
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files (Cargo.lock) are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "pathway"
version = "0.7.6"
version = "0.7.7"
edition = "2021"
publish = false
rust-version = "1.72.0"
77 changes: 77 additions & 0 deletions python/pathway/xpacks/llm/splitter.py
@@ -2,6 +2,7 @@
A library of text splitters - routines which split a long text into smaller chunks.
"""

import unicodedata
from typing import Dict, List, Tuple


@@ -17,3 +18,79 @@ def null_splitter(txt: str) -> List[Tuple[str, Dict]]:
    The null splitter always returns a list of length one containing the full text and empty metadata.
    """
    return [(txt, {})]


def _normalize_unicode(text: str) -> str:
    """
    Normalize the text to NFKC form, folding ligatures (e.g. "ﬁ" becomes "fi")
    and other compatibility characters.
    """
    return unicodedata.normalize("NFKC", text)


class TokenCountSplitter:
    """
    Splits a given string or a list of strings into chunks based on token
    count.

    This splitter tokenizes the input texts and splits them into smaller parts ("chunks")
    ensuring that each chunk has a token count between `min_tokens` and
    `max_tokens`. It also attempts to break chunks at sensible points such as
    punctuation marks.

    Arguments:
        min_tokens: minimum tokens in a chunk of text.
        max_tokens: maximum size of a chunk in tokens.
        encoding_name: name of the encoding from `tiktoken`.

    Example:

    # >>> from pathway.xpacks.llm.splitter import TokenCountSplitter
    # >>> import pathway as pw
    # >>> t = pw.debug.table_from_markdown(
    # ...     '''| text
    # ... 1| cooltext'''
    # ... )
    # >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1)
    # >>> t += t.select(chunks = pw.apply(splitter, pw.this.text))
    # >>> pw.debug.compute_and_print(t, include_id=False)
    # text     | chunks
    # cooltext | (('cool', pw.Json({})), ('text', pw.Json({})))
    """

    CHARS_PER_TOKEN = 3
    PUNCTUATION = [".", "?", "!", "\n"]

    def __init__(
        self,
        min_tokens: int = 50,
        max_tokens: int = 500,
        encoding_name: str = "cl100k_base",
    ):
        self.min_tokens = min_tokens
        self.max_tokens = max_tokens
        self.encoding_name = encoding_name

    def __call__(self, txt: str) -> List[Tuple[str, Dict]]:
        import tiktoken

        tokenizer = tiktoken.get_encoding(self.encoding_name)
        text = _normalize_unicode(txt)
        tokens = tokenizer.encode_ordinary(text)
        output: List[Tuple[str, Dict]] = []
        i = 0
        while i < len(tokens):
            # Take up to max_tokens tokens and decode them back into text.
            chunk_tokens = tokens[i : i + self.max_tokens]
            chunk = tokenizer.decode(chunk_tokens)
            # Find the last punctuation mark in the chunk (-1 if there is none).
            last_punctuation = max(
                [chunk.rfind(p) for p in self.PUNCTUATION], default=-1
            )
            # Cut the chunk at that punctuation mark, but only if the remaining text
            # is still estimated (at CHARS_PER_TOKEN characters per token) to hold
            # at least min_tokens tokens.
            if (
                last_punctuation != -1
                and last_punctuation > self.CHARS_PER_TOKEN * self.min_tokens
            ):
                chunk = chunk[: last_punctuation + 1]

            # Advance by the number of tokens actually consumed by this chunk.
            i += len(tokenizer.encode_ordinary(chunk))

            output.append((chunk, {}))
        return output
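
Below is a minimal usage sketch of the new TokenCountSplitter, based only on the code added in this diff; it is not part of the commit. It assumes pathway 0.7.7 and tiktoken are installed (tiktoken fetches the cl100k_base encoding on first use), and the sample text and parameter values are made up for illustration.

from pathway.xpacks.llm.splitter import TokenCountSplitter

# Chunk sizes are expressed in tokens of the chosen tiktoken encoding.
splitter = TokenCountSplitter(min_tokens=10, max_tokens=50, encoding_name="cl100k_base")

long_text = "Pathway processes data streams. " * 40
chunks = splitter(long_text)  # list of (chunk_text, metadata) tuples

for chunk, metadata in chunks:
    print(len(chunk), repr(chunk[:40]), metadata)

A chunk is only cut back to its last punctuation mark when that mark lies beyond roughly CHARS_PER_TOKEN * min_tokens characters, so chunks should not be truncated far below the requested minimum. Inside a Pathway pipeline the splitter is applied per row, e.g. with pw.apply(splitter, pw.this.text) as in the docstring example above.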
