diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md
index fbacf4ade..9abca2b79 100644
--- a/transforms/language/doc_chunk/python/README.md
+++ b/transforms/language/doc_chunk/python/README.md
@@ -32,7 +32,6 @@ The transform can be tuned with the following parameters.
 | `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. |
 | `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
 | `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
-| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. |
 | `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
 | `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
 | `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml
index eeff859f0..c9728712e 100644
--- a/transforms/language/doc_chunk/python/pyproject.toml
+++ b/transforms/language/doc_chunk/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_doc_chunk_transform_python"
-version = "0.2.2.dev1"
+version = "0.3.0"
 requires-python = ">=3.10,<3.13"
 description = "chunk documents Python Transform"
 license = {text = "Apache-2.0"}
diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt
index 2db4bd1f1..446998895 100644
--- a/transforms/language/doc_chunk/python/requirements.txt
+++ b/transforms/language/doc_chunk/python/requirements.txt
@@ -1,3 +1,3 @@
 data-prep-toolkit==0.2.2.dev1
-docling-core==1.7.2
+docling-core==2.3.0
 llama-index-core>=0.11.0,<0.12.0
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
index a8ba44f61..52c3bc978 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
+++ b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
@@ -13,11 +13,12 @@
 from abc import ABCMeta, abstractmethod
 from typing import Iterator, Optional, Dict, List
 
-from docling_core.types import Document as DLDocument
+from docling_core.types.doc import DoclingDocument
 from llama_index.core.node_parser.text.token import TokenTextSplitter
 from llama_index.core import Document as LIDocument
 from llama_index.core.node_parser import MarkdownNodeParser
 from docling_core.transforms.chunker import HierarchicalChunker
+from docling_core.transforms.chunker.hierarchical_chunker import DocChunk
 
 
 class ChunkingExecutor(metaclass=ABCMeta):
@@ -29,7 +30,6 @@ def chunk(self, content: str) -> Iterator[dict]:
 class DLJsonChunker(ChunkingExecutor):
     def __init__(
         self,
-        min_chunk_len: Optional[int],
         output_chunk_column_name: str,
         output_jsonpath_column_name: str,
         output_pageno_column_name_key: str,
@@ -40,19 +40,19 @@ def __init__(
         self.output_pageno_column_name_key = output_pageno_column_name_key
         self.output_bbox_column_name_key = output_bbox_column_name_key
 
-        chunker_kwargs = dict(include_metadata=True)
-        if min_chunk_len is not None:
-            chunker_kwargs["min_chunk_len"] = min_chunk_len
-        self._chunker = HierarchicalChunker(**chunker_kwargs)
+        self._chunker = HierarchicalChunker()
 
     def chunk(self, content: str) -> Iterator[dict]:
-        doc = DLDocument.model_validate_json(content)
+        doc = DoclingDocument.model_validate_json(content)
         for chunk in self._chunker.chunk(doc):
+            chunk: DocChunk
+            doc_item = chunk.meta.doc_items[0]
+            prov = doc_item.prov[0]
             yield {
                 self.output_chunk_column_name: chunk.text,
-                self.output_jsonpath_column_name: chunk.path,
-                self.output_pageno_column_name_key: chunk.page,
-                self.output_bbox_column_name_key: chunk.bbox,
+                self.output_jsonpath_column_name: doc_item.self_ref,
+                self.output_pageno_column_name_key: prov.page_no,
+                self.output_bbox_column_name_key: prov.bbox.as_tuple(),
             }
 
 
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
index 7acdd3ef1..da5540cba 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
+++ b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -26,7 +26,6 @@
 content_column_name_key = "content_column_name"
 doc_id_column_name_key = "doc_id_column_name"
 chunking_type_key = "chunking_type"
-dl_min_chunk_len_key = "dl_min_chunk_len"
 chunk_size_tokens_key = "chunk_size_tokens"
 chunk_overlap_tokens_key = "chunk_overlap_tokens"
 output_chunk_column_name_key = "output_chunk_column_name"
@@ -38,7 +37,6 @@
 content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
 doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
 chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
-dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
 output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
 output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
 output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
@@ -59,7 +57,6 @@ def __str__(self):
 default_content_column_name = "contents"
 default_doc_id_column_name = "document_id"
 default_chunking_type = chunking_types.DL_JSON
-default_dl_min_chunk_len = None
 default_output_chunk_column_name = "contents"
 default_output_chunk_column_id = "chunk_id"
 default_output_source_doc_id_column_name = "source_document_id"
@@ -95,7 +92,6 @@ def __init__(self, config: dict[str, Any]):
         self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)
 
         # Parameters for Docling JSON chunking
-        self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
         self.output_jsonpath_column_name = config.get(
             output_jsonpath_column_name_key, default_output_jsonpath_column_name
         )
@@ -113,7 +109,6 @@
         self.chunker: ChunkingExecutor
         if self.chunking_type == chunking_types.DL_JSON:
             self.chunker = DLJsonChunker(
-                min_chunk_len=self.dl_min_chunk_len,
                 output_chunk_column_name=self.output_chunk_column_name,
                 output_jsonpath_column_name=self.output_jsonpath_column_name,
                 output_pageno_column_name_key=self.output_pageno_column_name_key,
@@ -202,11 +197,6 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=default_doc_id_column_name,
             help="Name of the column containing the doc_id to be propagated in the output",
         )
-        parser.add_argument(
-            f"--{dl_min_chunk_len_cli_param}",
-            default=default_dl_min_chunk_len,
-            help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
-        )
         parser.add_argument(
             f"--{output_chunk_column_name_cli_param}",
             default=default_output_chunk_column_name,
diff --git a/transforms/language/doc_chunk/python/test-data/expected/metadata.json b/transforms/language/doc_chunk/python/test-data/expected/metadata.json
index 7eeaaa279..e83a0375b 100644
--- a/transforms/language/doc_chunk/python/test-data/expected/metadata.json
+++ b/transforms/language/doc_chunk/python/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "doc_chunk",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 14:05:09",
-    "end_time": "2024-10-18 14:05:11",
+    "start_time": "2024-10-30 18:38:40",
+    "end_time": "2024-10-30 18:38:40",
     "status": "success"
   },
   "code": {
@@ -18,7 +18,6 @@
     "chunking_type": "dl_json",
     "content_column_name": "contents",
     "doc_id_column_name": "document_id",
-    "dl_min_chunk_len": null,
     "output_chunk_column_name": "contents",
     "output_source_doc_id_column_name": "source_document_id",
     "output_jsonpath_column_name": "doc_jsonpath",
@@ -35,22 +34,22 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 27.9,
+    "cpus": 19.5,
     "gpus": 0,
-    "memory": 25.75,
+    "memory": 27.48,
     "object_store": 0,
-    "execution time, min": 0.021
+    "execution time, min": 0.001
   },
   "job_output_stats": {
     "source_files": 1,
-    "source_size": 50276,
+    "source_size": 12073,
     "result_files": 1,
-    "result_size": 31223,
-    "processing_time": 1.266,
+    "result_size": 14363,
+    "processing_time": 0.043,
     "nfiles": 1,
-    "nrows": 88,
+    "nrows": 39,
     "source_doc_count": 1,
-    "result_doc_count": 88
+    "result_doc_count": 39
   },
   "source": {
     "name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
diff --git a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet
index 06089be78..46714dde7 100644
Binary files a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet and b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet differ
diff --git a/transforms/language/doc_chunk/python/test-data/input/test1.parquet b/transforms/language/doc_chunk/python/test-data/input/test1.parquet
index 4015fccb0..32905aa74 100644
Binary files a/transforms/language/doc_chunk/python/test-data/input/test1.parquet and b/transforms/language/doc_chunk/python/test-data/input/test1.parquet differ
diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml
index 4240ae263..aa4e5d093 100644
--- a/transforms/language/doc_chunk/ray/pyproject.toml
+++ b/transforms/language/doc_chunk/ray/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_doc_chunk_transform_ray"
-version = "0.2.2.dev1"
+version = "0.3.0"
 requires-python = ">=3.10,<3.13"
 description = "chunk documents Ray Transform"
 license = {text = "Apache-2.0"}
@@ -11,7 +11,7 @@ authors = [
     { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
 ]
 dependencies = [
-    "dpk-doc-chunk-transform-python==0.2.2.dev1",
+    "dpk-doc-chunk-transform-python==0.3.0",
     "data-prep-toolkit[ray]==0.2.2.dev1",
 ]
 
diff --git a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json
index 7eeaaa279..e83a0375b 100644
--- a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json
+++ b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "doc_chunk",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 14:05:09",
-    "end_time": "2024-10-18 14:05:11",
+    "start_time": "2024-10-30 18:38:40",
+    "end_time": "2024-10-30 18:38:40",
     "status": "success"
   },
   "code": {
@@ -18,7 +18,6 @@
     "chunking_type": "dl_json",
     "content_column_name": "contents",
     "doc_id_column_name": "document_id",
-    "dl_min_chunk_len": null,
     "output_chunk_column_name": "contents",
     "output_source_doc_id_column_name": "source_document_id",
     "output_jsonpath_column_name": "doc_jsonpath",
@@ -35,22 +34,22 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 27.9,
+    "cpus": 19.5,
     "gpus": 0,
-    "memory": 25.75,
+    "memory": 27.48,
     "object_store": 0,
-    "execution time, min": 0.021
+    "execution time, min": 0.001
   },
   "job_output_stats": {
     "source_files": 1,
-    "source_size": 50276,
+    "source_size": 12073,
     "result_files": 1,
-    "result_size": 31223,
-    "processing_time": 1.266,
+    "result_size": 14363,
+    "processing_time": 0.043,
     "nfiles": 1,
-    "nrows": 88,
+    "nrows": 39,
     "source_doc_count": 1,
-    "result_doc_count": 88
+    "result_doc_count": 39
   },
   "source": {
     "name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
diff --git a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet
index 06089be78..46714dde7 100644
Binary files a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet and b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet differ
diff --git a/transforms/language/doc_chunk/ray/test-data/input/test1.parquet b/transforms/language/doc_chunk/ray/test-data/input/test1.parquet
index 4015fccb0..32905aa74 100644
Binary files a/transforms/language/doc_chunk/ray/test-data/input/test1.parquet and b/transforms/language/doc_chunk/ray/test-data/input/test1.parquet differ
diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt
index 1e80b5077..ed15a8981 100644
--- a/transforms/language/pdf2parquet/python/requirements.txt
+++ b/transforms/language/pdf2parquet/python/requirements.txt
@@ -1,6 +1,6 @@
 data-prep-toolkit==0.2.2.dev1
-docling-core==2.2.2
-docling-ibm-models==2.0.1
+docling-core==2.3.0
+docling-ibm-models==2.0.3
 deepsearch-glm==0.26.1
-docling==2.2.1
+docling==2.3.0
 filetype >=1.2.0, <2.0.0
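
Reviewer note: the DLJsonChunker rewrite above tracks the docling-core 1.x -> 2.x API change. The flat chunk fields (`chunk.path`, `chunk.page`, `chunk.bbox`) no longer exist; location metadata now comes from the provenance of the chunk's backing document items, and the `min_chunk_len` knob is gone, which is why it is removed from the README, CLI, and expected metadata as well. Below is a minimal standalone sketch of the new flow, using only calls that appear in this diff; the input path `doc.json` is a hypothetical stand-in for one Docling JSON `contents` cell of the input parquet.

# Sketch only: mirrors the new DLJsonChunker.chunk() logic from this PR.
from docling_core.types.doc import DoclingDocument
from docling_core.transforms.chunker import HierarchicalChunker
from docling_core.transforms.chunker.hierarchical_chunker import DocChunk

with open("doc.json") as f:  # hypothetical Docling JSON export
    doc = DoclingDocument.model_validate_json(f.read())

chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc):
    chunk: DocChunk
    doc_item = chunk.meta.doc_items[0]  # first doc item backing this chunk
    prov = doc_item.prov[0]             # its provenance: page number + bounding box
    print(chunk.text, doc_item.self_ref, prov.page_no, prov.bbox.as_tuple())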