Update doc_chunk to use the new docling v2

Signed-off-by: Michele Dolfi <[email protected]>
IBM · Oct 30, 2024 · e929903 · e929903
1 parent 7e5ea90
commit e929903
Show file tree

Hide file tree

Showing 13 changed files with 37 additions and 50 deletions.
diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md
@@ -32,7 +32,6 @@ The transform can be tuned with the following parameters.
 | `chunking_type`        | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. |
 | `content_column_name`        | `contents` | Name of the column containing the text to be chunked. |
 | `doc_id_column_name`         | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
-| `dl_min_chunk_len`           | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. |
 | `chunk_size_tokens`          | `128` | Size of the chunk in tokens for the token text chunker. |
 | `chunk_overlap_tokens`       | `30` | Number of tokens overlapping between chunks for the token text chunker. |
 | `output_chunk_column_name`   | `contents` | Column name to store the chunks in the output table. |

diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_doc_chunk_transform_python"
-version = "0.2.2.dev1"
+version = "0.3.0"
 requires-python = ">=3.10,<3.13"
 description = "chunk documents Python Transform"
 license = {text = "Apache-2.0"}

diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt
@@ -1,3 +1,3 @@
 data-prep-toolkit==0.2.2.dev1
-docling-core==1.7.2
+docling-core==2.3.0
 llama-index-core>=0.11.0,<0.12.0
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
@@ -13,11 +13,12 @@
 from abc import ABCMeta, abstractmethod
 from typing import Iterator, Optional, Dict, List
 
-from docling_core.types import Document as DLDocument
+from docling_core.types.doc import DoclingDocument
 from llama_index.core.node_parser.text.token import TokenTextSplitter
 from llama_index.core import Document as LIDocument
 from llama_index.core.node_parser import MarkdownNodeParser
 from docling_core.transforms.chunker import HierarchicalChunker
+from docling_core.transforms.chunker.hierarchical_chunker import DocChunk
 
 
 class ChunkingExecutor(metaclass=ABCMeta):
@@ -29,7 +30,6 @@ def chunk(self, content: str) -> Iterator[dict]:
 class DLJsonChunker(ChunkingExecutor):
     def __init__(
         self,
-        min_chunk_len: Optional[int],
         output_chunk_column_name: str,
         output_jsonpath_column_name: str,
         output_pageno_column_name_key: str,
@@ -40,19 +40,19 @@ def __init__(
         self.output_pageno_column_name_key = output_pageno_column_name_key
         self.output_bbox_column_name_key = output_bbox_column_name_key
 
-        chunker_kwargs = dict(include_metadata=True)
-        if min_chunk_len is not None:
-            chunker_kwargs["min_chunk_len"] = min_chunk_len
-        self._chunker = HierarchicalChunker(**chunker_kwargs)
+        self._chunker = HierarchicalChunker()
 
     def chunk(self, content: str) -> Iterator[dict]:
-        doc = DLDocument.model_validate_json(content)
+        doc = DoclingDocument.model_validate_json(content)
         for chunk in self._chunker.chunk(doc):
+            chunk: DocChunk
+            doc_item = chunk.meta.doc_items[0]
+            prov = doc_item.prov[0]
             yield {
                 self.output_chunk_column_name: chunk.text,
-                self.output_jsonpath_column_name: chunk.path,
-                self.output_pageno_column_name_key: chunk.page,
-                self.output_bbox_column_name_key: chunk.bbox,
+                self.output_jsonpath_column_name: doc_item.self_ref,
+                self.output_pageno_column_name_key: prov.page_no,
+                self.output_bbox_column_name_key: prov.bbox.as_tuple(),
             }
 
 

diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -26,7 +26,6 @@
 content_column_name_key = "content_column_name"
 doc_id_column_name_key = "doc_id_column_name"
 chunking_type_key = "chunking_type"
-dl_min_chunk_len_key = "dl_min_chunk_len"
 chunk_size_tokens_key = "chunk_size_tokens"
 chunk_overlap_tokens_key = "chunk_overlap_tokens"
 output_chunk_column_name_key = "output_chunk_column_name"
@@ -38,7 +37,6 @@
 content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
 doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
 chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
-dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
 output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
 output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
 output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
@@ -59,7 +57,6 @@ def __str__(self):
 default_content_column_name = "contents"
 default_doc_id_column_name = "document_id"
 default_chunking_type = chunking_types.DL_JSON
-default_dl_min_chunk_len = None
 default_output_chunk_column_name = "contents"
 default_output_chunk_column_id = "chunk_id"
 default_output_source_doc_id_column_name = "source_document_id"
@@ -95,7 +92,6 @@ def __init__(self, config: dict[str, Any]):
         self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)
 
         # Parameters for Docling JSON chunking
-        self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
         self.output_jsonpath_column_name = config.get(
             output_jsonpath_column_name_key, default_output_jsonpath_column_name
         )
@@ -113,7 +109,6 @@ def __init__(self, config: dict[str, Any]):
         self.chunker: ChunkingExecutor
         if self.chunking_type == chunking_types.DL_JSON:
             self.chunker = DLJsonChunker(
-                min_chunk_len=self.dl_min_chunk_len,
                 output_chunk_column_name=self.output_chunk_column_name,
                 output_jsonpath_column_name=self.output_jsonpath_column_name,
                 output_pageno_column_name_key=self.output_pageno_column_name_key,
@@ -202,11 +197,6 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=default_doc_id_column_name,
             help="Name of the column containing the doc_id to be propagated in the output",
         )
-        parser.add_argument(
-            f"--{dl_min_chunk_len_cli_param}",
-            default=default_dl_min_chunk_len,
-            help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
-        )
         parser.add_argument(
             f"--{output_chunk_column_name_cli_param}",
             default=default_output_chunk_column_name,

diff --git a/transforms/language/doc_chunk/python/test-data/expected/metadata.json b/transforms/language/doc_chunk/python/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "doc_chunk",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 14:05:09",
-    "end_time": "2024-10-18 14:05:11",
+    "start_time": "2024-10-30 18:38:40",
+    "end_time": "2024-10-30 18:38:40",
     "status": "success"
   },
   "code": {
@@ -18,7 +18,6 @@
     "chunking_type": "dl_json",
     "content_column_name": "contents",
     "doc_id_column_name": "document_id",
-    "dl_min_chunk_len": null,
     "output_chunk_column_name": "contents",
     "output_source_doc_id_column_name": "source_document_id",
     "output_jsonpath_column_name": "doc_jsonpath",
@@ -35,22 +34,22 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 27.9,
+    "cpus": 19.5,
     "gpus": 0,
-    "memory": 25.75,
+    "memory": 27.48,
     "object_store": 0,
-    "execution time, min": 0.021
+    "execution time, min": 0.001
   },
   "job_output_stats": {
     "source_files": 1,
-    "source_size": 50276,
+    "source_size": 12073,
     "result_files": 1,
-    "result_size": 31223,
-    "processing_time": 1.266,
+    "result_size": 14363,
+    "processing_time": 0.043,
     "nfiles": 1,
-    "nrows": 88,
+    "nrows": 39,
     "source_doc_count": 1,
-    "result_doc_count": 88
+    "result_doc_count": 39
   },
   "source": {
     "name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",

diff --git a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet
diff --git a/transforms/language/doc_chunk/python/test-data/input/test1.parquet b/transforms/language/doc_chunk/python/test-data/input/test1.parquet
diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_doc_chunk_transform_ray"
-version = "0.2.2.dev1"
+version = "0.3.0"
 requires-python = ">=3.10,<3.13"
 description = "chunk documents Ray Transform"
 license = {text = "Apache-2.0"}
@@ -11,7 +11,7 @@ authors = [
     { name = "Christoph Auer", email = "[email protected]" },
 ]
 dependencies = [
-    "dpk-doc-chunk-transform-python==0.2.2.dev1",
+    "dpk-doc-chunk-transform-python==0.3.0",
     "data-prep-toolkit[ray]==0.2.2.dev1",
 ]
 

diff --git a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "doc_chunk",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 14:05:09",
-    "end_time": "2024-10-18 14:05:11",
+    "start_time": "2024-10-30 18:38:40",
+    "end_time": "2024-10-30 18:38:40",
     "status": "success"
   },
   "code": {
@@ -18,7 +18,6 @@
     "chunking_type": "dl_json",
     "content_column_name": "contents",
     "doc_id_column_name": "document_id",
-    "dl_min_chunk_len": null,
     "output_chunk_column_name": "contents",
     "output_source_doc_id_column_name": "source_document_id",
     "output_jsonpath_column_name": "doc_jsonpath",
@@ -35,22 +34,22 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 27.9,
+    "cpus": 19.5,
     "gpus": 0,
-    "memory": 25.75,
+    "memory": 27.48,
     "object_store": 0,
-    "execution time, min": 0.021
+    "execution time, min": 0.001
   },
   "job_output_stats": {
     "source_files": 1,
-    "source_size": 50276,
+    "source_size": 12073,
     "result_files": 1,
-    "result_size": 31223,
-    "processing_time": 1.266,
+    "result_size": 14363,
+    "processing_time": 0.043,
     "nfiles": 1,
-    "nrows": 88,
+    "nrows": 39,
     "source_doc_count": 1,
-    "result_doc_count": 88
+    "result_doc_count": 39
   },
   "source": {
     "name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",

diff --git a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet
diff --git a/transforms/language/doc_chunk/ray/test-data/input/test1.parquet b/transforms/language/doc_chunk/ray/test-data/input/test1.parquet
diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt
@@ -1,6 +1,6 @@
 data-prep-toolkit==0.2.2.dev1
-docling-core==2.2.2
-docling-ibm-models==2.0.1
+docling-core==2.3.0
+docling-ibm-models==2.0.3
 deepsearch-glm==0.26.1
-docling==2.2.1
+docling==2.3.0
 filetype >=1.2.0, <2.0.0