Skip to content

Commit

Permalink
Merge pull request #642 from juancappi/feat/641-fixed-size-token-chun…
Browse files Browse the repository at this point in the history
…king

feat: #641 - Extend document chunker transform to support fixed-size token window chunker with overlap - Python Only
  • Loading branch information
touma-I authored Oct 7, 2024
2 parents 75e3364 + 137fb2d commit d04454e
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 7 deletions.
4 changes: 3 additions & 1 deletion transforms/language/doc_chunk/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,12 @@ The transform can be tuned with the following parameters.

| Parameter | Default | Description |
|------------|----------|--------------|
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), and `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-size windows of tokens. |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting it to None uses the library default, i.e. `min_chunk_len=64`. |
| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
Expand Down
73 changes: 72 additions & 1 deletion transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@
################################################################################

from abc import ABCMeta, abstractmethod
from typing import Iterator, Optional
from typing import Iterator, Optional, Dict, List

from docling_core.types import Document as DLDocument
from llama_index.core.node_parser.text.token import TokenTextSplitter
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import MarkdownNodeParser
from docling_core.transforms.chunker import HierarchicalChunker
Expand Down Expand Up @@ -66,3 +67,73 @@ def chunk(self, content: str) -> Iterator[dict]:
yield {
self.output_chunk_column_name: node.text,
}


class LITokenTextSplitter(ChunkingExecutor):
    """
    Chunker that delegates to LlamaIndex's ``TokenTextSplitter`` to cut text into fixed-window pieces.

    Both the window length and the overlap shared by consecutive windows are expressed in tokens
    rather than characters. Each emitted chunk is paired with a running index that starts at 0
    for every call to :meth:`chunk`.

    Args:
        output_chunk_column_name (str): Key under which the text of each chunk is emitted.
        output_chunk_column_id (str): Key under which the index of each chunk is emitted.
        chunk_size_tokens (int): Length of each chunk, in tokens.
        chunk_overlap_tokens (int): Number of tokens shared between consecutive chunks.

    Attributes:
        output_chunk_column_name (str)
        output_chunk_column_id (str)
        chunk_size (int): Stored value of ``chunk_size_tokens``.
        chunk_overlap (int): Stored value of ``chunk_overlap_tokens``.
    """

    def __init__(
        self,
        output_chunk_column_name: str,
        output_chunk_column_id: str,
        chunk_size_tokens: int,
        chunk_overlap_tokens: int,
    ):
        # Splitter settings, forwarded to TokenTextSplitter on every split.
        self.chunk_size = chunk_size_tokens
        self.chunk_overlap = chunk_overlap_tokens
        # Keys used in the dictionaries yielded by chunk().
        self.output_chunk_column_id = output_chunk_column_id
        self.output_chunk_column_name = output_chunk_column_name

    def _chunk_text(self, text: str) -> List[str]:
        """
        Split ``text`` with a freshly built ``TokenTextSplitter``.

        Args:
            text (str): Input text to be split.

        Returns:
            List[str]: The pieces returned by ``TokenTextSplitter.split_text``.
        """
        splitter_settings = {
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
        }
        return TokenTextSplitter(**splitter_settings).split_text(text)

    def chunk(self, text: str) -> Iterator[Dict]:
        """
        Split ``text`` into overlapping token windows and emit them one by one.

        Args:
            text (str): Input text to be chunked.

        Yields:
            Dict: One dictionary per chunk, holding the chunk index under
            ``output_chunk_column_id`` and the chunk text under ``output_chunk_column_name``.
        """
        for position, piece in enumerate(self._chunk_text(text)):
            # Key order (id first, then text) is kept as in the original implementation.
            yield {
                self.output_chunk_column_id: position,
                self.output_chunk_column_name: piece,
            }
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration

from doc_chunk_transform import chunking_types

# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_token_text"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
local_conf = {
"input_folder": input_folder,
Expand All @@ -39,6 +40,11 @@
# doc_chunk params
# "doc_chunk_chunking_type": "li_markdown",
"doc_chunk_chunking_type": "dl_json",
# "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT,
# fixed-size params
# "doc_chunk_output_chunk_column_name": "chunk_text",
# "doc_chunk_chunk_size_tokens": 128,
# "doc_chunk_chunk_overlap_tokens": 30
}
if __name__ == "__main__":
# Set the simulated command line args
Expand Down
37 changes: 34 additions & 3 deletions transforms/language/doc_chunk/python/src/doc_chunk_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, LITokenTextSplitter


short_name = "doc_chunk"
Expand All @@ -27,7 +27,10 @@
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
chunk_size_tokens_key = "chunk_size_tokens"
chunk_overlap_tokens_key = "chunk_overlap_tokens"
output_chunk_column_name_key = "output_chunk_column_name"
output_chunk_column_id_key = "output_chunk_column_id"
output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
output_jsonpath_column_name_key = "output_jsonpath_column_name"
output_pageno_column_name_key = "output_pageno_column_name"
Expand All @@ -41,11 +44,13 @@
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"

chunk_size_tokens_cli_param = f"{cli_prefix}{chunk_size_tokens_key}"
chunk_overlap_tokens_cli_param = f"{cli_prefix}{chunk_overlap_tokens_key}"

class chunking_types(str, enum.Enum):
LI_MARKDOWN = "li_markdown"
DL_JSON = "dl_json"
LI_TOKEN_TEXT = "li_token_text"

def __str__(self):
return str(self.value)
Expand All @@ -56,11 +61,13 @@ def __str__(self):
default_chunking_type = chunking_types.DL_JSON
default_dl_min_chunk_len = None
default_output_chunk_column_name = "contents"
default_output_chunk_column_id = "chunk_id"
default_output_source_doc_id_column_name = "source_document_id"
default_output_jsonpath_column_name = "doc_jsonpath"
default_output_pageno_column_name = "page_number"
default_output_bbox_column_name = "bbox"

default_chunk_size_tokens = 128
default_chunk_overlap_tokens = 30

class DocChunkTransform(AbstractTableTransform):
"""
Expand All @@ -84,6 +91,7 @@ def __init__(self, config: dict[str, Any]):
self.content_column_name = config.get(content_column_name_key, default_content_column_name)
self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
self.output_chunk_column_id = config.get(output_chunk_column_id_key, default_output_chunk_column_id)
self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

# Parameters for Docling JSON chunking
Expand All @@ -96,6 +104,10 @@ def __init__(self, config: dict[str, Any]):
)
self.output_bbox_column_name_key = config.get(output_bbox_column_name_key, default_output_bbox_column_name)

# Parameters for Fixed-size with overlap chunking
self.chunk_size_tokens = config.get(chunk_size_tokens_key, default_chunk_size_tokens)
self.chunk_overlap_tokens = config.get(chunk_overlap_tokens_key, default_chunk_overlap_tokens)

# Initialize chunker

self.chunker: ChunkingExecutor
Expand All @@ -111,6 +123,13 @@ def __init__(self, config: dict[str, Any]):
self.chunker = LIMarkdown(
output_chunk_column_name=self.output_chunk_column_name,
)
elif self.chunking_type == chunking_types.LI_TOKEN_TEXT:
self.chunker = LITokenTextSplitter(
output_chunk_column_name=self.output_chunk_column_name,
output_chunk_column_id=self.output_chunk_column_id,
chunk_size_tokens=self.chunk_size_tokens,
chunk_overlap_tokens=self.chunk_overlap_tokens
)
else:
raise RuntimeError(f"{self.chunking_type=} is not valid.")

Expand Down Expand Up @@ -213,6 +232,18 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_output_bbox_column_name,
help="Column name to store the bbox of the chunk",
)
parser.add_argument(
f"--{chunk_size_tokens_cli_param}",
default=default_chunk_size_tokens,
type=int,
help="Size of the chunk in tokens for the fixed-sized chunker",
)
parser.add_argument(
f"--{chunk_overlap_tokens_cli_param}",
default=default_chunk_overlap_tokens,
type=int,
help="Number of tokens overlapping between chunks for the fixed-sized chunker.",
)

def apply_input_params(self, args: Namespace) -> bool:
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-04 14:00:40",
"end_time": "2024-10-04 14:00:41",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "li_token_text",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "chunk_text",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"chunk_size_tokens": 128,
"chunk_overlap_tokens": 30,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 1,
"source_size": 17749,
"result_files": 1,
"result_size": 8827,
"processing_time": 0.194,
"nfiles": 1,
"nrows": 10,
"source_doc_count": 2,
"result_doc_count": 10
},
"source": {
"name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/test-data/input_token_text",
"type": "path"
},
"target": {
"name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
from data_processing.test_support.launch.transform_test import (
AbstractTransformLauncherTest,
)
from doc_chunk_transform import chunking_type_cli_param, chunking_types
from doc_chunk_transform import (
chunking_type_cli_param,
output_chunk_column_name_cli_param,
chunking_types
)
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration


Expand Down Expand Up @@ -55,4 +59,17 @@ def get_test_transform_fixtures(self) -> list[tuple]:
basedir + "/expected_md",
)
)

# Run with fixed size token chunker
fixtures.append(
(
launcher,
{
chunking_type_cli_param: chunking_types.LI_TOKEN_TEXT,
output_chunk_column_name_cli_param: "chunk_text"
},
basedir + "/input_token_text",
basedir + "/expected_token_text",
)
)
return fixtures

0 comments on commit d04454e

Please sign in to comment.