From 450974ca2d41ebada087a3a37ef8b6a17c613c0e Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 15 Jan 2025 10:22:48 +0100 Subject: [PATCH] Added tokens/sec measurement, improved example Signed-off-by: Maksym Lysak --- docling/models/smol_docling_model.py | 9 ++- docs/examples/minimal_smol_docling.py | 79 +++++++++++++++------------ 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/docling/models/smol_docling_model.py b/docling/models/smol_docling_model.py index bcc7eadf..3d48a532 100644 --- a/docling/models/smol_docling_model.py +++ b/docling/models/smol_docling_model.py @@ -63,7 +63,6 @@ def __call__( else: with TimeRecorder(conv_res, "smolvlm"): assert page.size is not None - start_time = time.time() hi_res_image = page.get_image(scale=2.0) # 144dpi # populate page_tags with predicted doc tags @@ -95,19 +94,27 @@ def __call__( inputs = {k: v.to(self.device) for k, v in inputs.items()} prompt = prompt.replace("", "") + start_time = time.time() # Call model to generate: generated_ids = self.vlm_model.generate( **inputs, max_new_tokens=4096 ) + generation_time = time.time() - start_time + generated_texts = self.processor.batch_decode( generated_ids, skip_special_tokens=True )[0] + num_tokens = len(generated_ids[0]) generated_texts = generated_texts.replace("Assistant: ", "") page_tags = generated_texts inference_time = time.time() - start_time + tokens_per_second = num_tokens / generation_time + print("") print(f"Page Inference Time: {inference_time:.2f} seconds") + print(f"Tokens/sec: {tokens_per_second:.2f}") + print("") print("Page predictions:") print(page_tags) diff --git a/docs/examples/minimal_smol_docling.py b/docs/examples/minimal_smol_docling.py index 50dbd0dc..cefa8894 100644 --- a/docs/examples/minimal_smol_docling.py +++ b/docs/examples/minimal_smol_docling.py @@ -1,8 +1,11 @@ +import json import os import time from pathlib import Path from urllib.parse import urlparse +import yaml + from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions @@ -11,15 +14,16 @@ # source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL # source = "tests/data/2305.03393v1-pg9-img.png" -source = "tests/data/2305.03393v1-pg9.pdf" +# source = "tests/data/2305.03393v1-pg9.pdf" # source = "demo_data/page.png" # source = "demo_data/original_tables.pdf" -parsed = urlparse(source) -if parsed.scheme in ("http", "https"): - out_name = os.path.basename(parsed.path) -else: - out_name = os.path.basename(source) +sources = [ + "tests/data/2305.03393v1-pg9-img.png", + # "tests/data/2305.03393v1-pg9.pdf", + # "demo_data/page.png", + # "demo_data/original_tables.pdf", +] pipeline_options = PdfPipelineOptions() pipeline_options.generate_page_images = True @@ -41,34 +45,41 @@ } ) -start_time = time.time() -print("============") -print("starting...") -print("============") -print("") - -result = converter.convert(source) - -print("------------") -print("MD:") -print("------------") -print("") -print(result.document.export_to_markdown()) - -Path("scratch").mkdir(parents=True, exist_ok=True) -result.document.save_as_html( - filename=Path("scratch/{}.html".format(out_name)), - image_mode=ImageRefMode.REFERENCED, - labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE], -) +out_path = Path("scratch") +out_path.mkdir(parents=True, exist_ok=True) -pg_num = result.document.num_pages() +for source in sources: + start_time = time.time() + print("================================================") + print("Processing... {}".format(source)) + print("================================================") + print("") -print("") -inference_time = time.time() - start_time -print(f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}") -print("============") -print("done!") -print("============") + res = converter.convert(source) + + print("------------------------------------------------") + print("MD:") + print("------------------------------------------------") + print("") + print(res.document.export_to_markdown()) + + with (out_path / f"{res.input.file.stem}.html").open("w") as fp: + fp.write(res.document.export_to_html()) -# output: ## Docling Technical Report [...]" + with (out_path / f"{res.input.file.stem}.json").open("w") as fp: + fp.write(json.dumps(res.document.export_to_dict())) + + with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp: + fp.write(yaml.safe_dump(res.document.export_to_dict())) + + pg_num = res.document.num_pages() + + print("") + inference_time = time.time() - start_time + print( + f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}" + ) + +print("================================================") +print("done!") +print("================================================")