From e0929781f42958ffe5a68139843ef7f1cb063fac Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 15 Jan 2025 10:22:48 +0100 Subject: [PATCH] Added tokens/sec measurement, improved example Signed-off-by: Maksym Lysak --- docling/models/smol_docling_model.py | 9 +++- docs/examples/minimal_smol_docling.py | 73 +++++++++++++++------------ 2 files changed, 50 insertions(+), 32 deletions(-) diff --git a/docling/models/smol_docling_model.py b/docling/models/smol_docling_model.py index bcc7eadf..3d48a532 100644 --- a/docling/models/smol_docling_model.py +++ b/docling/models/smol_docling_model.py @@ -63,7 +63,6 @@ class SmolDoclingModel(BasePageModel): else: with TimeRecorder(conv_res, "smolvlm"): assert page.size is not None - start_time = time.time() hi_res_image = page.get_image(scale=2.0) # 144dpi # populate page_tags with predicted doc tags @@ -95,19 +94,27 @@ class SmolDoclingModel(BasePageModel): inputs = {k: v.to(self.device) for k, v in inputs.items()} prompt = prompt.replace("<end_of_utterance>", "") + start_time = time.time() # Call model to generate: generated_ids = self.vlm_model.generate( **inputs, max_new_tokens=4096 ) + generation_time = time.time() - start_time + generated_texts = self.processor.batch_decode( generated_ids, skip_special_tokens=True )[0] + num_tokens = len(generated_ids[0]) generated_texts = generated_texts.replace("Assistant: ", "") page_tags = generated_texts inference_time = time.time() - start_time + tokens_per_second = num_tokens / generation_time + print("") print(f"Page Inference Time: {inference_time:.2f} seconds") + print(f"Tokens/sec: {tokens_per_second:.2f}") + print("") print("Page predictions:") print(page_tags) diff --git a/docs/examples/minimal_smol_docling.py b/docs/examples/minimal_smol_docling.py index 50dbd0dc..cefa8894 100644 --- a/docs/examples/minimal_smol_docling.py +++ b/docs/examples/minimal_smol_docling.py @@ -1,8 +1,11 @@ +import json import os import time from pathlib import Path from urllib.parse import urlparse +import yaml + 
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions @@ -11,15 +14,16 @@ from docling.pipeline.vlm_pipeline import VlmPipeline # source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL # source = "tests/data/2305.03393v1-pg9-img.png" -source = "tests/data/2305.03393v1-pg9.pdf" +# source = "tests/data/2305.03393v1-pg9.pdf" # source = "demo_data/page.png" # source = "demo_data/original_tables.pdf" -parsed = urlparse(source) -if parsed.scheme in ("http", "https"): - out_name = os.path.basename(parsed.path) -else: - out_name = os.path.basename(source) +sources = [ + "tests/data/2305.03393v1-pg9-img.png", + # "tests/data/2305.03393v1-pg9.pdf", + # "demo_data/page.png", + # "demo_data/original_tables.pdf", +] pipeline_options = PdfPipelineOptions() pipeline_options.generate_page_images = True @@ -41,34 +45,41 @@ converter = DocumentConverter( } ) -start_time = time.time() -print("============") -print("starting...") -print("============") -print("") +out_path = Path("scratch") +out_path.mkdir(parents=True, exist_ok=True) -result = converter.convert(source) +for source in sources: + start_time = time.time() + print("================================================") + print("Processing... 
{}".format(source)) + print("================================================") + print("") -print("------------") -print("MD:") -print("------------") -print("") -print(result.document.export_to_markdown()) + res = converter.convert(source) -Path("scratch").mkdir(parents=True, exist_ok=True) -result.document.save_as_html( - filename=Path("scratch/{}.html".format(out_name)), - image_mode=ImageRefMode.REFERENCED, - labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE], -) + print("------------------------------------------------") + print("MD:") + print("------------------------------------------------") + print("") + print(res.document.export_to_markdown()) -pg_num = result.document.num_pages() + with (out_path / f"{res.input.file.stem}.html").open("w") as fp: + fp.write(res.document.export_to_html()) -print("") -inference_time = time.time() - start_time -print(f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}") -print("============") + with (out_path / f"{res.input.file.stem}.json").open("w") as fp: + fp.write(json.dumps(res.document.export_to_dict())) + + with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp: + fp.write(yaml.safe_dump(res.document.export_to_dict())) + + pg_num = res.document.num_pages() + + print("") + inference_time = time.time() - start_time + print( + f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}" + ) + +print("================================================") print("done!") -print("============") - -# output: ## Docling Technical Report [...]" +print("================================================")