From 0904dbb95a929e5788c0e1c6e560dca535118dce Mon Sep 17 00:00:00 2001 From: ElHachem02 Date: Wed, 3 Dec 2025 12:59:31 +0100 Subject: [PATCH] feat: update inference code to shuffle layout elements and discard initial prompt Signed-off-by: ElHachem02 --- .../pipeline/threaded_layout_vlm_pipeline.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py index 92c0c104..e64cb201 100644 --- a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +++ b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py @@ -19,6 +19,8 @@ from PIL import Image as PILImage if TYPE_CHECKING: from docling_core.types.doc.page import SegmentedPage +import random + from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import ConversionStatus, Page @@ -84,13 +86,16 @@ class ThreadedLayoutVlmPipeline(BasePipeline): class LayoutAwareVlmOptions(type(base_vlm_options)): # type: ignore[misc] def build_prompt( self, - page: Optional[SegmentedPage], *, _internal_page: Optional[Page] = None, ) -> str: base_prompt = self.prompt augmented_prompt = base_prompt + # Only augment convert to docling base prompts + if base_prompt != "Convert this page to docling.": + return base_prompt + # In this layout-aware pipeline, _internal_page is always provided if _internal_page is None: return base_prompt @@ -111,6 +116,10 @@ class ThreadedLayoutVlmPipeline(BasePipeline): label=cluster.label ) + if tag_name == DocumentToken.TABLE: + print("Found a table!") + tag_name = "otsl" + # Convert bbox to tuple and get location tokens bbox_tuple = cluster.bbox.as_tuple() location_tokens = DocumentToken.get_location( @@ -124,13 +133,17 @@ class ThreadedLayoutVlmPipeline(BasePipeline): layout_elements.append(xml_element) if layout_elements: + # Shuffle elements + random.shuffle(layout_elements) + # Join elements with newlines and wrap in layout tags layout_xml = ( "" + "\n".join(layout_elements) + "" ) layout_injection = f"{layout_xml}" - augmented_prompt = base_prompt + layout_injection + augmented_prompt = layout_injection + print(f"final prompt is {augmented_prompt}") _log.debug( "Enhanced Prompt with Layout Info: %s\n", augmented_prompt