Skip to content

Commit

Permalink
Added capability for vlm_pipeline to grab text from preconfigured backend
Browse files Browse the repository at this point in the history

Signed-off-by: Maksym Lysak <[email protected]>
  • Loading branch information
Maksym Lysak committed Jan 16, 2025
1 parent 450974c commit 3aa2414
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 44 deletions.
110 changes: 74 additions & 36 deletions docling/pipeline/vlm_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
TableItem,
)
from docling_core.types.doc.tokens import DocumentToken, TableToken
from PIL.Image import Image

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
Expand All @@ -43,6 +42,11 @@ def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions

# TODO: Move "use_backend_text" to pipeline parameters!
# use_backend_text = False - use text that is coming from SmolDocling
        # use_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling
self.use_backend_text = False

if pipeline_options.artifacts_path is None:
self.artifacts_path = self.download_models_hf()
else:
Expand Down Expand Up @@ -94,17 +98,7 @@ def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):

# Read and concatenate the page doctags:
# document_tags = ""
page_tags = []
page_images = []

for page in conv_res.pages:
if page.predictions.doctags is not None:
page_tags.append(page.predictions.doctags.tag_string)
page_images.append(page.image)

conv_res.document = self._turn_tags_into_doc(page_tags, page_images)
conv_res.document = self._turn_tags_into_doc(conv_res.pages)

# Generate images of the requested element types
if (
Expand Down Expand Up @@ -140,9 +134,21 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:

return conv_res

def _turn_tags_into_doc(
self, full_doc_xml_content: list[str], pil_images: list[Image | None]
) -> DoclingDocument:
def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:

def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
# Convert bounding box normalized to 0-100 into page coordinates for cropping
text = ""
if bbox:
if page.size:
bbox.l = bbox.l * page.size.width
bbox.t = bbox.t * page.size.height
bbox.r = bbox.r * page.size.width
bbox.b = bbox.b * page.size.height
if page._backend:
text = page._backend.get_text_in_rect(bbox)
return text

def extract_text(tag_content: str) -> str:
return re.sub(r"<.*?>", "", tag_content).strip()

Expand Down Expand Up @@ -206,7 +212,8 @@ def count_down(tokens, c_idx, r_idx, which_tokens):

next_bottom_cell = ""
if r_idx + 1 < len(split_row_tokens):
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
if c_idx < len(split_row_tokens[r_idx + 1]):
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]

if next_right_cell in [
TableToken.OTSL_LCEL.value,
Expand Down Expand Up @@ -296,12 +303,16 @@ def parse_table_content(otsl_content: str) -> TableData:
doc = DoclingDocument(name="Example Document")
current_group = None

for pg_idx, xml_content in enumerate(full_doc_xml_content):
pil_image = pil_images[pg_idx]
for pg_idx, page in enumerate(pages):
xml_content = ""
if page.predictions.doctags:
xml_content = page.predictions.doctags.tag_string
pil_image = page.image
page_no = pg_idx + 1

if pil_image:
pg_width, pg_height = pil_image.size
if page.size:
pg_width = page.size.width
pg_height = page.size.height
size = Size(width=pg_width, height=pg_height)
parent_page = doc.add_page(page_no=page_no, size=size)

Expand All @@ -312,16 +323,19 @@ def parse_table_content(otsl_content: str) -> TableData:
line = line.strip()
line = line.replace("<doc_tag>", "")
if line.startswith("<paragraph>"):
content = extract_text(line)
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)

if prov_item:
bounding_boxes.append((prov_item, "red"))
doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
Expand All @@ -330,8 +344,12 @@ def parse_table_content(otsl_content: str) -> TableData:
),
)
elif line.startswith("<title>"):
content = extract_text(line)
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)

if prov_item:
bounding_boxes.append((prov_item, "blue"))
current_group = doc.add_group(
Expand All @@ -342,7 +360,6 @@ def parse_table_content(otsl_content: str) -> TableData:
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
Expand All @@ -352,8 +369,12 @@ def parse_table_content(otsl_content: str) -> TableData:
)

elif line.startswith("<section-header>"):
content = extract_text(line)
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)

if prov_item:
bounding_boxes.append((prov_item, "green"))
current_group = doc.add_group(
Expand All @@ -364,7 +385,6 @@ def parse_table_content(otsl_content: str) -> TableData:
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
Expand All @@ -382,16 +402,18 @@ def parse_table_content(otsl_content: str) -> TableData:
doc.add_table(data=table_data, parent=current_group)

elif line.startswith("<footnote>"):
content = extract_text(line)
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
if prov_item:
bounding_boxes.append((prov_item, "orange"))
doc.add_text(
label=DocItemLabel.FOOTNOTE,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
Expand All @@ -401,16 +423,18 @@ def parse_table_content(otsl_content: str) -> TableData:
)

elif line.startswith("<page-header>"):
content = extract_text(line)
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
if prov_item:
bounding_boxes.append((prov_item, "purple"))
doc.add_text(
label=DocItemLabel.PAGE_HEADER,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
Expand All @@ -420,16 +444,18 @@ def parse_table_content(otsl_content: str) -> TableData:
)

elif line.startswith("<page-footer>"):
content = extract_text(line)
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
if prov_item:
bounding_boxes.append((prov_item, "cyan"))
doc.add_text(
label=DocItemLabel.PAGE_FOOTER,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
Expand Down Expand Up @@ -468,9 +494,12 @@ def parse_table_content(otsl_content: str) -> TableData:
),
)
elif line.startswith("<list>"):
content = extract_text(line)
prov_item_inst = None
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
if prov_item:
bounding_boxes.append((prov_item, "brown"))
prov_item_inst = ProvenanceItem(
Expand All @@ -484,9 +513,12 @@ def parse_table_content(otsl_content: str) -> TableData:
)

elif line.startswith("<caption>"):
content = extract_text(line)
prov_item_inst = None
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
if prov_item:
bounding_boxes.append((prov_item, "magenta"))
prov_item_inst = ProvenanceItem(
Expand All @@ -499,9 +531,12 @@ def parse_table_content(otsl_content: str) -> TableData:
prov=prov_item_inst if prov_item_inst else None,
)
elif line.startswith("<checkbox-unselected>"):
content = extract_text(line)
prov_item_inst = None
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
if prov_item:
bounding_boxes.append((prov_item, "gray"))
prov_item_inst = ProvenanceItem(
Expand All @@ -515,9 +550,12 @@ def parse_table_content(otsl_content: str) -> TableData:
)

elif line.startswith("<checkbox-selected>"):
content = extract_text(line)
prov_item_inst = None
prov_item = extract_bounding_box(line)
if self.use_backend_text:
content = extract_text_from_backend(page, prov_item)
else:
content = extract_text(line)
if prov_item:
bounding_boxes.append((prov_item, "black"))
prov_item_inst = ProvenanceItem(
Expand Down
10 changes: 2 additions & 8 deletions docs/examples/minimal_smol_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,9 @@
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
# source = "tests/data/2305.03393v1-pg9-img.png"
# source = "tests/data/2305.03393v1-pg9.pdf"
# source = "demo_data/page.png"
# source = "demo_data/original_tables.pdf"

sources = [
"tests/data/2305.03393v1-pg9-img.png",
# "tests/data/2305.03393v1-pg9.pdf",
# "tests/data/2305.03393v1-pg9-img.png",
"tests/data/2305.03393v1-pg9.pdf",
# "demo_data/page.png",
# "demo_data/original_tables.pdf",
]
Expand Down

0 comments on commit 3aa2414

Please sign in to comment.