Skip to content

Commit

Permalink
update doc_chunk with new docling v2
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Oct 30, 2024
1 parent 7e5ea90 commit e929903
Show file tree
Hide file tree
Showing 13 changed files with 37 additions and 50 deletions.
1 change: 0 additions & 1 deletion transforms/language/doc_chunk/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ The transform can be tuned with the following parameters.
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. |
| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
Expand Down
2 changes: 1 addition & 1 deletion transforms/language/doc_chunk/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_doc_chunk_transform_python"
version = "0.2.2.dev1"
version = "0.3.0"
requires-python = ">=3.10,<3.13"
description = "chunk documents Python Transform"
license = {text = "Apache-2.0"}
Expand Down
2 changes: 1 addition & 1 deletion transforms/language/doc_chunk/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data-prep-toolkit==0.2.2.dev1
docling-core==1.7.2
docling-core==2.3.0
llama-index-core>=0.11.0,<0.12.0
20 changes: 10 additions & 10 deletions transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@
from abc import ABCMeta, abstractmethod
from typing import Iterator, Optional, Dict, List

from docling_core.types import Document as DLDocument
from docling_core.types.doc import DoclingDocument
from llama_index.core.node_parser.text.token import TokenTextSplitter
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import MarkdownNodeParser
from docling_core.transforms.chunker import HierarchicalChunker
from docling_core.transforms.chunker.hierarchical_chunker import DocChunk


class ChunkingExecutor(metaclass=ABCMeta):
Expand All @@ -29,7 +30,6 @@ def chunk(self, content: str) -> Iterator[dict]:
class DLJsonChunker(ChunkingExecutor):
def __init__(
self,
min_chunk_len: Optional[int],
output_chunk_column_name: str,
output_jsonpath_column_name: str,
output_pageno_column_name_key: str,
Expand All @@ -40,19 +40,19 @@ def __init__(
self.output_pageno_column_name_key = output_pageno_column_name_key
self.output_bbox_column_name_key = output_bbox_column_name_key

chunker_kwargs = dict(include_metadata=True)
if min_chunk_len is not None:
chunker_kwargs["min_chunk_len"] = min_chunk_len
self._chunker = HierarchicalChunker(**chunker_kwargs)
self._chunker = HierarchicalChunker()

def chunk(self, content: str) -> Iterator[dict]:
doc = DLDocument.model_validate_json(content)
doc = DoclingDocument.model_validate_json(content)
for chunk in self._chunker.chunk(doc):
chunk: DocChunk
doc_item = chunk.meta.doc_items[0]
prov = doc_item.prov[0]
yield {
self.output_chunk_column_name: chunk.text,
self.output_jsonpath_column_name: chunk.path,
self.output_pageno_column_name_key: chunk.page,
self.output_bbox_column_name_key: chunk.bbox,
self.output_jsonpath_column_name: doc_item.self_ref,
self.output_pageno_column_name_key: prov.page_no,
self.output_bbox_column_name_key: prov.bbox.as_tuple(),
}


Expand Down
10 changes: 0 additions & 10 deletions transforms/language/doc_chunk/python/src/doc_chunk_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
content_column_name_key = "content_column_name"
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
chunk_size_tokens_key = "chunk_size_tokens"
chunk_overlap_tokens_key = "chunk_overlap_tokens"
output_chunk_column_name_key = "output_chunk_column_name"
Expand All @@ -38,7 +37,6 @@
content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
Expand All @@ -59,7 +57,6 @@ def __str__(self):
default_content_column_name = "contents"
default_doc_id_column_name = "document_id"
default_chunking_type = chunking_types.DL_JSON
default_dl_min_chunk_len = None
default_output_chunk_column_name = "contents"
default_output_chunk_column_id = "chunk_id"
default_output_source_doc_id_column_name = "source_document_id"
Expand Down Expand Up @@ -95,7 +92,6 @@ def __init__(self, config: dict[str, Any]):
self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

# Parameters for Docling JSON chunking
self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
self.output_jsonpath_column_name = config.get(
output_jsonpath_column_name_key, default_output_jsonpath_column_name
)
Expand All @@ -113,7 +109,6 @@ def __init__(self, config: dict[str, Any]):
self.chunker: ChunkingExecutor
if self.chunking_type == chunking_types.DL_JSON:
self.chunker = DLJsonChunker(
min_chunk_len=self.dl_min_chunk_len,
output_chunk_column_name=self.output_chunk_column_name,
output_jsonpath_column_name=self.output_jsonpath_column_name,
output_pageno_column_name_key=self.output_pageno_column_name_key,
Expand Down Expand Up @@ -202,11 +197,6 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_doc_id_column_name,
help="Name of the column containing the doc_id to be propagated in the output",
)
parser.add_argument(
f"--{dl_min_chunk_len_cli_param}",
default=default_dl_min_chunk_len,
help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
)
parser.add_argument(
f"--{output_chunk_column_name_cli_param}",
default=default_output_chunk_column_name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-18 14:05:09",
"end_time": "2024-10-18 14:05:11",
"start_time": "2024-10-30 18:38:40",
"end_time": "2024-10-30 18:38:40",
"status": "success"
},
"code": {
Expand All @@ -18,7 +18,6 @@
"chunking_type": "dl_json",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "contents",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
Expand All @@ -35,22 +34,22 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 27.9,
"cpus": 19.5,
"gpus": 0,
"memory": 25.75,
"memory": 27.48,
"object_store": 0,
"execution time, min": 0.021
"execution time, min": 0.001
},
"job_output_stats": {
"source_files": 1,
"source_size": 50276,
"source_size": 12073,
"result_files": 1,
"result_size": 31223,
"processing_time": 1.266,
"result_size": 14363,
"processing_time": 0.043,
"nfiles": 1,
"nrows": 88,
"nrows": 39,
"source_doc_count": 1,
"result_doc_count": 88
"result_doc_count": 39
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
Expand Down
Binary file not shown.
Binary file not shown.
4 changes: 2 additions & 2 deletions transforms/language/doc_chunk/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_doc_chunk_transform_ray"
version = "0.2.2.dev1"
version = "0.3.0"
requires-python = ">=3.10,<3.13"
description = "chunk documents Ray Transform"
license = {text = "Apache-2.0"}
Expand All @@ -11,7 +11,7 @@ authors = [
{ name = "Christoph Auer", email = "[email protected]" },
]
dependencies = [
"dpk-doc-chunk-transform-python==0.2.2.dev1",
"dpk-doc-chunk-transform-python==0.3.0",
"data-prep-toolkit[ray]==0.2.2.dev1",
]

Expand Down
21 changes: 10 additions & 11 deletions transforms/language/doc_chunk/ray/test-data/expected/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-18 14:05:09",
"end_time": "2024-10-18 14:05:11",
"start_time": "2024-10-30 18:38:40",
"end_time": "2024-10-30 18:38:40",
"status": "success"
},
"code": {
Expand All @@ -18,7 +18,6 @@
"chunking_type": "dl_json",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "contents",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
Expand All @@ -35,22 +34,22 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 27.9,
"cpus": 19.5,
"gpus": 0,
"memory": 25.75,
"memory": 27.48,
"object_store": 0,
"execution time, min": 0.021
"execution time, min": 0.001
},
"job_output_stats": {
"source_files": 1,
"source_size": 50276,
"source_size": 12073,
"result_files": 1,
"result_size": 31223,
"processing_time": 1.266,
"result_size": 14363,
"processing_time": 0.043,
"nfiles": 1,
"nrows": 88,
"nrows": 39,
"source_doc_count": 1,
"result_doc_count": 88
"result_doc_count": 39
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
Expand Down
Binary file not shown.
Binary file modified transforms/language/doc_chunk/ray/test-data/input/test1.parquet
Binary file not shown.
6 changes: 3 additions & 3 deletions transforms/language/pdf2parquet/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
data-prep-toolkit==0.2.2.dev1
docling-core==2.2.2
docling-ibm-models==2.0.1
docling-core==2.3.0
docling-ibm-models==2.0.3
deepsearch-glm==0.26.1
docling==2.2.1
docling==2.3.0
filetype >=1.2.0, <2.0.0

0 comments on commit e929903

Please sign in to comment.