Release 0.7.7
Co-authored-by: Michał Bartoszkiewicz <[email protected]>
Co-authored-by: Jan Chorowski <[email protected]>
Co-authored-by: Xavier Gendre <[email protected]>
Co-authored-by: Adrian Kosowski <[email protected]>
Co-authored-by: Jakub Kowalski <[email protected]>
Co-authored-by: Sergey Kulik <[email protected]>
Co-authored-by: Mateusz Lewandowski <[email protected]>
Co-authored-by: Mohamed Malhou <[email protected]>
Co-authored-by: Krzysztof Nowicki <[email protected]>
Co-authored-by: Richard Pelgrim <[email protected]>
Co-authored-by: Kamil Piechowiak <[email protected]>
Co-authored-by: Paweł Podhajski <[email protected]>
Co-authored-by: Olivier Ruas <[email protected]>
Co-authored-by: Przemysław Uznański <[email protected]>
Co-authored-by: Sebastian Włudzik <[email protected]>
GitOrigin-RevId: 312344420a55f049c50addb049b77b403a5ce194
16 people committed Dec 27, 2023
1 parent 06c1ad0 commit c371b11
Showing 4 changed files with 84 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]

## [0.7.7] - 2023-12-27

### Added
- pathway.xpacks.llm.splitter.TokenCountSplitter.

## [0.7.6] - 2023-12-22

## New Features
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files (Cargo.lock) are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "pathway"
version = "0.7.6"
version = "0.7.7"
edition = "2021"
publish = false
rust-version = "1.72.0"
77 changes: 77 additions & 0 deletions python/pathway/xpacks/llm/splitter.py
@@ -2,6 +2,7 @@
A library of text splitters - routines which split a long text into smaller chunks.
"""

import unicodedata
from typing import Dict, List, Tuple


@@ -17,3 +18,79 @@ def null_splitter(txt: str) -> List[Tuple[str, Dict]]:
    The null splitter always returns a list of length one containing the full text and empty metadata.
    """
    return [(txt, {})]


def _normalize_unicode(text: str) -> str:
    """
    Normalize the text to NFKC form, folding ligatures (e.g. "ﬁ" becomes "fi")
    and other compatibility characters.
    """
    return unicodedata.normalize("NFKC", text)


class TokenCountSplitter:
    """
    Splits a given string or a list of strings into chunks based on token
    count.

    This splitter tokenizes the input texts and splits them into smaller parts ("chunks")
    ensuring that each chunk has a token count between `min_tokens` and
    `max_tokens`. It also attempts to break chunks at sensible points such as
    punctuation marks.

    Arguments:
        min_tokens: minimum tokens in a chunk of text.
        max_tokens: maximum size of a chunk in tokens.
        encoding_name: name of the encoding from `tiktoken`.

    Example:

    # >>> from pathway.xpacks.llm.splitter import TokenCountSplitter
    # >>> import pathway as pw
    # >>> t = pw.debug.table_from_markdown(
    # ...     '''| text
    # ... 1| cooltext'''
    # ... )
    # >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1)
    # >>> t += t.select(chunks = pw.apply(splitter, pw.this.text))
    # >>> pw.debug.compute_and_print(t, include_id=False)
    # text     | chunks
    # cooltext | (('cool', pw.Json({})), ('text', pw.Json({})))
    """

    CHARS_PER_TOKEN = 3
    PUNCTUATION = [".", "?", "!", "\n"]

    def __init__(
        self,
        min_tokens: int = 50,
        max_tokens: int = 500,
        encoding_name: str = "cl100k_base",
    ):
        self.min_tokens = min_tokens
        self.max_tokens = max_tokens
        self.encoding_name = encoding_name

    def __call__(self, txt: str) -> List[Tuple[str, Dict]]:
        import tiktoken

        tokenizer = tiktoken.get_encoding(self.encoding_name)
        text = _normalize_unicode(txt)
        tokens = tokenizer.encode_ordinary(text)
        output: List[Tuple[str, Dict]] = []
        i = 0
        while i < len(tokens):
            # Take up to max_tokens tokens and decode them back into text.
            chunk_tokens = tokens[i : i + self.max_tokens]
            chunk = tokenizer.decode(chunk_tokens)
            # Find the last punctuation mark in the chunk (-1 if there is none).
            last_punctuation = max(
                [chunk.rfind(p) for p in self.PUNCTUATION], default=-1
            )
            # Cut the chunk at that punctuation mark, but only if the remaining text
            # is still estimated (at CHARS_PER_TOKEN characters per token) to hold
            # at least min_tokens tokens.
            if (
                last_punctuation != -1
                and last_punctuation > self.CHARS_PER_TOKEN * self.min_tokens
            ):
                chunk = chunk[: last_punctuation + 1]

            # Advance by the number of tokens actually consumed by this chunk.
            i += len(tokenizer.encode_ordinary(chunk))

            output.append((chunk, {}))
        return output
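
Below is a minimal usage sketch of the new TokenCountSplitter, based only on the code added in this diff; it is not part of the commit. It assumes pathway 0.7.7 and tiktoken are installed (tiktoken fetches the cl100k_base encoding on first use), and the sample text and parameter values are made up for illustration.

from pathway.xpacks.llm.splitter import TokenCountSplitter

# Chunk sizes are expressed in tokens of the chosen tiktoken encoding.
splitter = TokenCountSplitter(min_tokens=10, max_tokens=50, encoding_name="cl100k_base")

long_text = "Pathway processes data streams. " * 40
chunks = splitter(long_text)  # list of (chunk_text, metadata) tuples

for chunk, metadata in chunks:
    print(len(chunk), repr(chunk[:40]), metadata)

A chunk is only cut back to its last punctuation mark when that mark lies beyond roughly CHARS_PER_TOKEN * min_tokens characters, so chunks should not be truncated far below the requested minimum. Inside a Pathway pipeline the splitter is applied per row, e.g. with pw.apply(splitter, pw.this.text) as in the docstring example above.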
