From c371b117de57074e543919f347dc73abf37ba488 Mon Sep 17 00:00:00 2001
From: Manul from Pathway
Date: Wed, 27 Dec 2023 14:25:55 +0100
Subject: [PATCH] Release 0.7.7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Michał Bartoszkiewicz
Co-authored-by: Jan Chorowski
Co-authored-by: Xavier Gendre
Co-authored-by: Adrian Kosowski
Co-authored-by: Jakub Kowalski
Co-authored-by: Sergey Kulik
Co-authored-by: Mateusz Lewandowski
Co-authored-by: Mohamed Malhou
Co-authored-by: Krzysztof Nowicki
Co-authored-by: Richard Pelgrim
Co-authored-by: Kamil Piechowiak
Co-authored-by: Paweł Podhajski
Co-authored-by: Olivier Ruas
Co-authored-by: Przemysław Uznański
Co-authored-by: Sebastian Włudzik
GitOrigin-RevId: 312344420a55f049c50addb049b77b403a5ce194
---
 CHANGELOG.md                          |  5 ++
 Cargo.lock                            |  2 +-
 Cargo.toml                            |  2 +-
 python/pathway/xpacks/llm/splitter.py | 77 +++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e4ca8ce..9a6a12b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+## [0.7.7] - 2023-12-27
+
+### Added
+- pathway.xpacks.llm.splitter.TokenCountSplitter.
+
 ## [0.7.6] - 2023-12-22
 
 ## New Features
diff --git a/Cargo.lock b/Cargo.lock
index 631d055b..1565d4f7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1505,7 +1505,7 @@ dependencies = [
 
 [[package]]
 name = "pathway"
-version = "0.7.6"
+version = "0.7.7"
 dependencies = [
  "arc-swap",
  "arcstr",
diff --git a/Cargo.toml b/Cargo.toml
index 162e28a9..e324cc06 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "pathway"
-version = "0.7.6"
+version = "0.7.7"
 edition = "2021"
 publish = false
 rust-version = "1.72.0"
diff --git a/python/pathway/xpacks/llm/splitter.py b/python/pathway/xpacks/llm/splitter.py
index 983ded63..6e335166 100644
--- a/python/pathway/xpacks/llm/splitter.py
+++ b/python/pathway/xpacks/llm/splitter.py
@@ -2,6 +2,7 @@
 A library of text splitters - routines which split a long text into smaller chunks.
 """
 
+import unicodedata
 from typing import Dict, List, Tuple
 
 
@@ -17,3 +18,79 @@ def null_splitter(txt: str) -> List[Tuple[str, Dict]]:
     The null splitter always returns a list of length one containing the full text and empty metadata.
     """
     return [(txt, {})]
+
+
+def _normalize_unicode(text: str):
+    """
+    Get rid of ligatures
+    """
+    return unicodedata.normalize("NFKC", text)
+
+
+class TokenCountSplitter:
+    """
+    Splits a given string or a list of strings into chunks based on token
+    count.
+
+    This splitter tokenizes the input texts and splits them into smaller parts ("chunks")
+    ensuring that each chunk has a token count between `min_tokens` and
+    `max_tokens`. It also attempts to break chunks at sensible points such as
+    punctuation marks.
+
+    Arguments:
+        min_tokens: minimum tokens in a chunk of text.
+        max_tokens: maximum size of a chunk in tokens.
+        encoding_name: name of the encoding from `tiktoken`.
+
+    Example:
+
+    # >>> from pathway.xpacks.llm.splitter import TokenCountSplitter
+    # >>> import pathway as pw
+    # >>> t = pw.debug.table_from_markdown(
+    # ...     '''| text
+    # ... 1| cooltext'''
+    # ... )
+    # >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1)
+    # >>> t += t.select(chunks = pw.apply(splitter, pw.this.text))
+    # >>> pw.debug.compute_and_print(t, include_id=False)
+    # text | chunks
+    # cooltext | (('cool', pw.Json({})), ('text', pw.Json({})))
+    """
+
+    CHARS_PER_TOKEN = 3
+    PUNCTUATION = [".", "?", "!", "\n"]
+
+    def __init__(
+        self,
+        min_tokens: int = 50,
+        max_tokens: int = 500,
+        encoding_name: str = "cl100k_base",
+    ):
+        self.min_tokens = min_tokens
+        self.max_tokens = max_tokens
+        self.encoding_name = encoding_name
+
+    def __call__(self, txt: str) -> List[Tuple[str, Dict]]:
+        import tiktoken
+
+        tokenizer = tiktoken.get_encoding(self.encoding_name)
+        text = _normalize_unicode(txt)
+        tokens = tokenizer.encode_ordinary(text)
+        output: List[Tuple[str, Dict]] = []
+        i = 0
+        while i < len(tokens):
+            chunk_tokens = tokens[i : i + self.max_tokens]
+            chunk = tokenizer.decode(chunk_tokens)
+            last_punctuation = max(
+                [chunk.rfind(p) for p in self.PUNCTUATION], default=-1
+            )
+            if (
+                last_punctuation != -1
+                and last_punctuation > self.CHARS_PER_TOKEN * self.min_tokens
+            ):
+                chunk = chunk[: last_punctuation + 1]
+
+            i += len(tokenizer.encode_ordinary(chunk))
+
+            output.append((chunk, {}))
+        return output
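
For quick reference, below is a minimal sketch of calling the new TokenCountSplitter directly on a plain string, outside a Pathway table. It assumes `tiktoken` is installed; the sample text and the small token bounds are made up for illustration and are not part of the patch.

    # Illustrative only: direct call to the splitter added in this release.
    from pathway.xpacks.llm.splitter import TokenCountSplitter

    # Made-up bounds, kept small so the short sample text can yield more than one chunk.
    splitter = TokenCountSplitter(min_tokens=5, max_tokens=20)
    text = (
        "Pathway ingests data streams. The LLM xpack can also split long "
        "documents. Each chunk stays within the configured token bounds."
    )

    for chunk, metadata in splitter(text):
        # __call__ returns a list of (chunk_text, metadata_dict) tuples.
        print(repr(chunk), metadata)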