From 0904dbb95a929e5788c0e1c6e560dca535118dce Mon Sep 17 00:00:00 2001
From: ElHachem02 <peterelhachem02@gmail.com>
Date: Wed, 3 Dec 2025 12:59:31 +0100
Subject: [PATCH] feat: update inference code to shuffle layout elements and
 discard initial prompt

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>
---
 .../pipeline/threaded_layout_vlm_pipeline.py    | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
index 92c0c104..e64cb201 100644
--- a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
+++ b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
@@ -19,6 +19,8 @@ from PIL import Image as PILImage
 if TYPE_CHECKING:
     from docling_core.types.doc.page import SegmentedPage
 
+import random
+
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, Page
@@ -84,13 +86,16 @@ class ThreadedLayoutVlmPipeline(BasePipeline):
         class LayoutAwareVlmOptions(type(base_vlm_options)):  # type: ignore[misc]
             def build_prompt(
                 self,
-                page: Optional[SegmentedPage],
                 *,
                 _internal_page: Optional[Page] = None,
             ) -> str:
                 base_prompt = self.prompt
                 augmented_prompt = base_prompt
 
+                # Only augment the exact "Convert this page to docling." prompt
+                if base_prompt != "Convert this page to docling.":
+                    return base_prompt
+
                 # In this layout-aware pipeline, _internal_page is always provided
                 if _internal_page is None:
                     return base_prompt
@@ -111,6 +116,10 @@ class ThreadedLayoutVlmPipeline(BasePipeline):
                             label=cluster.label
                         )
 
+                        if tag_name == DocumentToken.TABLE:
+                            print("Found a table!")
+                            tag_name = "otsl"
+
                         # Convert bbox to tuple and get location tokens
                         bbox_tuple = cluster.bbox.as_tuple()
                         location_tokens = DocumentToken.get_location(
@@ -124,13 +133,17 @@ class ThreadedLayoutVlmPipeline(BasePipeline):
                         layout_elements.append(xml_element)
 
                     if layout_elements:
+                        # Shuffle elements
+                        random.shuffle(layout_elements)
+
                         # Join elements with newlines and wrap in layout tags
                         layout_xml = (
                             "<layout>" + "\n".join(layout_elements) + "</layout>"
                         )
                         layout_injection = f"{layout_xml}"
 
-                        augmented_prompt = base_prompt + layout_injection
+                        augmented_prompt = layout_injection
+                        print(f"final prompt is {augmented_prompt}")
 
                     _log.debug(
                         "Enhanced Prompt with Layout Info: %s\n", augmented_prompt