Skip to content

Commit

Permalink
Exposed "force_backend_text" as pipeline parameter
Browse files Browse the repository at this point in the history
Signed-off-by: Maksym Lysak <[email protected]>
  • Loading branch information
Maksym Lysak committed Jan 16, 2025
1 parent 3aa2414 commit 632a4f4
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 14 deletions.
4 changes: 4 additions & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,10 @@ class PdfPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text

table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
Expand Down
27 changes: 13 additions & 14 deletions docling/pipeline/vlm_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,9 @@ def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions

# TODO: Move "use_backend_text" to pipeline parameters!
# use_backend_text = False - use text that is coming from SmolDocling
# use_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling
self.use_backend_text = False
# force_backend_text = False - use text that is coming from SmolDocling
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling
self.force_backend_text = pipeline_options.force_backend_text

if pipeline_options.artifacts_path is None:
self.artifacts_path = self.download_models_hf()
Expand Down Expand Up @@ -324,7 +323,7 @@ def parse_table_content(otsl_content: str) -> TableData:
line = line.replace("<doc_tag>", "")
if line.startswith("<paragraph>"):
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand All @@ -345,7 +344,7 @@ def parse_table_content(otsl_content: str) -> TableData:
)
elif line.startswith("<title>"):
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand All @@ -370,7 +369,7 @@ def parse_table_content(otsl_content: str) -> TableData:

elif line.startswith("<section-header>"):
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand Down Expand Up @@ -403,7 +402,7 @@ def parse_table_content(otsl_content: str) -> TableData:

elif line.startswith("<footnote>"):
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand All @@ -424,7 +423,7 @@ def parse_table_content(otsl_content: str) -> TableData:

elif line.startswith("<page-header>"):
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand All @@ -445,7 +444,7 @@ def parse_table_content(otsl_content: str) -> TableData:

elif line.startswith("<page-footer>"):
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand Down Expand Up @@ -496,7 +495,7 @@ def parse_table_content(otsl_content: str) -> TableData:
elif line.startswith("<list>"):
prov_item_inst = None
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand All @@ -515,7 +514,7 @@ def parse_table_content(otsl_content: str) -> TableData:
elif line.startswith("<caption>"):
prov_item_inst = None
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand All @@ -533,7 +532,7 @@ def parse_table_content(otsl_content: str) -> TableData:
elif line.startswith("<checkbox-unselected>"):
prov_item_inst = None
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand All @@ -552,7 +551,7 @@ def parse_table_content(otsl_content: str) -> TableData:
elif line.startswith("<checkbox-selected>"):
prov_item_inst = None
prov_item = extract_bounding_box(line)
if self.use_backend_text:
if self.force_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
Expand Down
3 changes: 3 additions & 0 deletions docs/examples/minimal_smol_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True
pipeline_options.force_backend_text = (
False # If True, text from backend will be used instead of generated text
)
pipeline_options.artifacts_path = "model_artifacts"

from docling_core.types.doc import DocItemLabel, ImageRefMode
Expand Down

0 comments on commit 632a4f4

Please sign in to comment.