Updated vlm pipeline assembly and smol docling model code to support updated doctags

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-01-17 17:54:55 +01:00
parent f6d123a01c
commit 0fe12d819a
3 changed files with 29 additions and 14 deletions
--- a/docling/models/smol_docling_model.py
+++ b/docling/models/smol_docling_model.py
@@ -97,16 +97,21 @@ class SmolDoclingModel(BasePageModel):
                    start_time = time.time()
                    # Call model to generate:
                    generated_ids = self.vlm_model.generate(
-                        **inputs, max_new_tokens=4096
+                        **inputs, max_new_tokens=4096, use_cache=True
                    )

                    generation_time = time.time() - start_time

                    generated_texts = self.processor.batch_decode(
-                        generated_ids, skip_special_tokens=True
+                        generated_ids, skip_special_tokens=False
                    )[0]
                    num_tokens = len(generated_ids[0])
-                    generated_texts = generated_texts.replace("Assistant: ", "")
+                    # DELETE NOISE BEFORE "Assistant: "
+                    starting_point = "Assistant: "
+                    generated_texts = generated_texts[
+                        generated_texts.index(starting_point) + len(starting_point) :
+                    ]
+                    # generated_texts = generated_texts.replace("Assistant: ", "")
                    page_tags = generated_texts

                    inference_time = time.time() - start_time
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -36,7 +36,8 @@ _log = logging.getLogger(__name__)


 class VlmPipeline(PaginatedPipeline):
-    _smol_vlm_path = "SmolDocling-0.0.2"
+    # _smol_vlm_path = "SmolDocling-0.0.2"
+    _smol_vlm_path = "SmolDocling_2.7_DT_0.7"

    def __init__(self, pipeline_options: PdfPipelineOptions):
        super().__init__(pipeline_options)
@@ -207,7 +208,9 @@ class VlmPipeline(PaginatedPipeline):
                        right_offset = 2

                    # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
-                    next_right_cell = texts[i + right_offset]
+                    next_right_cell = ""
+                    if i + right_offset < len(texts):
+                        next_right_cell = texts[i + right_offset]

                    next_bottom_cell = ""
                    if r_idx + 1 < len(split_row_tokens):
@@ -367,7 +370,7 @@ class VlmPipeline(PaginatedPipeline):
                        ),
                    )

-                elif line.startswith("<section-header>"):
+                elif line.startswith("<section_header_level_1>"):
                    prov_item = extract_bounding_box(line)
                    if self.force_backend_text:
                        content = extract_text_from_backend(page, prov_item)
@@ -421,7 +424,7 @@ class VlmPipeline(PaginatedPipeline):
                        ),
                    )

-                elif line.startswith("<page-header>"):
+                elif line.startswith("<page_header>"):
                    prov_item = extract_bounding_box(line)
                    if self.force_backend_text:
                        content = extract_text_from_backend(page, prov_item)
@@ -442,7 +445,7 @@ class VlmPipeline(PaginatedPipeline):
                        ),
                    )

-                elif line.startswith("<page-footer>"):
+                elif line.startswith("<page_footer>"):
                    prov_item = extract_bounding_box(line)
                    if self.force_backend_text:
                        content = extract_text_from_backend(page, prov_item)
@@ -463,7 +466,7 @@ class VlmPipeline(PaginatedPipeline):
                        ),
                    )

-                elif line.startswith("<figure>"):
+                elif line.startswith("<picture>"):
                    bbox = extract_bounding_box(line)
                    if bbox:
                        bounding_boxes.append((bbox, "yellow"))
@@ -492,7 +495,7 @@ class VlmPipeline(PaginatedPipeline):
                                    bbox=bbox, charspan=(0, 0), page_no=page_no
                                ),
                            )
-                elif line.startswith("<list>"):
+                elif line.startswith("<list_item>"):
                    prov_item_inst = None
                    prov_item = extract_bounding_box(line)
                    if self.force_backend_text:
@@ -529,7 +532,7 @@ class VlmPipeline(PaginatedPipeline):
                        parent=current_group,
                        prov=prov_item_inst if prov_item_inst else None,
                    )
-                elif line.startswith("<checkbox-unselected>"):
+                elif line.startswith("<checkbox_unselected>"):
                    prov_item_inst = None
                    prov_item = extract_bounding_box(line)
                    if self.force_backend_text:
@@ -548,7 +551,7 @@ class VlmPipeline(PaginatedPipeline):
                        prov=prov_item_inst if prov_item_inst else None,
                    )

-                elif line.startswith("<checkbox-selected>"):
+                elif line.startswith("<checkbox_selected>"):
                    prov_item_inst = None
                    prov_item = extract_bounding_box(line)
                    if self.force_backend_text:
--- a/docs/examples/minimal_smol_docling.py
+++ b/docs/examples/minimal_smol_docling.py
@@ -13,6 +13,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

 sources = [
+    # "https://arxiv.org/pdf/2408.09869"
    # "tests/data/2305.03393v1-pg9-img.png",
    "tests/data/2305.03393v1-pg9.pdf",
    # "demo_data/page.png",
@@ -60,8 +61,14 @@ for source in sources:
    print("")
    print(res.document.export_to_markdown())

-    with (out_path / f"{res.input.file.stem}.html").open("w") as fp:
-        fp.write(res.document.export_to_html())
+    # with (out_path / f"{res.input.file.stem}.html").open("w") as fp:
+    #     fp.write(res.document.export_to_html())
+
+    res.document.save_as_html(
+        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+        image_mode=ImageRefMode.REFERENCED,
+        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
+    )

    with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
        fp.write(json.dumps(res.document.export_to_dict()))