Exposed "force_backend_text" as pipeline parameter

Signed-off-by: Maksym Lysak <[email protected]>
DS4SD · Jan 16, 2025 · 632a4f4 · 632a4f4
1 parent 3aa2414
commit 632a4f4
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 14 deletions.
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -225,6 +225,10 @@ class PdfPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+    force_backend_text: bool = (
+        False  # (To be used with VLMs, or other generative models)
+    )
+    # If True, text from backend will be used instead of generated text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[

diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
@@ -42,10 +42,9 @@ def __init__(self, pipeline_options: PdfPipelineOptions):
         super().__init__(pipeline_options)
         self.pipeline_options: PdfPipelineOptions
 
-        # TODO: Move "use_backend_text" to pipeline parameters!
-        # use_backend_text = False - use text that is coming from SmolDocling
-        # use_backend_text = True - get text from backend using bounding boxes predicted by SmolDoclingss
-        self.use_backend_text = False
+        # force_backend_text = False - use text that is coming from SmolDocling
+        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling
+        self.force_backend_text = pipeline_options.force_backend_text
 
         if pipeline_options.artifacts_path is None:
             self.artifacts_path = self.download_models_hf()
@@ -324,7 +323,7 @@ def parse_table_content(otsl_content: str) -> TableData:
                 line = line.replace("<doc_tag>", "")
                 if line.startswith("<paragraph>"):
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -345,7 +344,7 @@ def parse_table_content(otsl_content: str) -> TableData:
                     )
                 elif line.startswith("<title>"):
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -370,7 +369,7 @@ def parse_table_content(otsl_content: str) -> TableData:
 
                 elif line.startswith("<section-header>"):
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -403,7 +402,7 @@ def parse_table_content(otsl_content: str) -> TableData:
 
                 elif line.startswith("<footnote>"):
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -424,7 +423,7 @@ def parse_table_content(otsl_content: str) -> TableData:
 
                 elif line.startswith("<page-header>"):
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -445,7 +444,7 @@ def parse_table_content(otsl_content: str) -> TableData:
 
                 elif line.startswith("<page-footer>"):
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -496,7 +495,7 @@ def parse_table_content(otsl_content: str) -> TableData:
                 elif line.startswith("<list>"):
                     prov_item_inst = None
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -515,7 +514,7 @@ def parse_table_content(otsl_content: str) -> TableData:
                 elif line.startswith("<caption>"):
                     prov_item_inst = None
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -533,7 +532,7 @@ def parse_table_content(otsl_content: str) -> TableData:
                 elif line.startswith("<checkbox-unselected>"):
                     prov_item_inst = None
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)
@@ -552,7 +551,7 @@ def parse_table_content(otsl_content: str) -> TableData:
                 elif line.startswith("<checkbox-selected>"):
                     prov_item_inst = None
                     prov_item = extract_bounding_box(line)
-                    if self.use_backend_text:
+                    if self.force_backend_text:
                         content = extract_text_from_backend(page, prov_item)
                     else:
                         content = extract_text(line)

diff --git a/docs/examples/minimal_smol_docling.py b/docs/examples/minimal_smol_docling.py
@@ -21,6 +21,9 @@
 
 pipeline_options = PdfPipelineOptions()
 pipeline_options.generate_page_images = True
+pipeline_options.force_backend_text = (
+    False  # If True, text from backend will be used instead of generated text
+)
 pipeline_options.artifacts_path = "model_artifacts"
 
 from docling_core.types.doc import DocItemLabel, ImageRefMode