Functional working two-stage; now need to implement a good prompt to leverage bounding boxes

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Peter Staar 2025-07-10 16:15:54 +02:00
parent b2d5c783ae
commit f4c1836c96


@@ -64,7 +64,10 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     user_prompt = self.vlm_model.get_user_prompt(page=page)
                     prompt = self.formulate_prompt(
-                        user_prompt=user_prompt, clusters=processed_clusters
+                        user_prompt=user_prompt,
+                        clusters=processed_clusters,
+                        image_width=page_image.width,
+                        image_height=page_image.height,
                     )

                     start_time = time.time()
@@ -73,15 +76,44 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                             page_image=page_image, prompt=prompt
                         )
                     )
+                    print("generated-text: \n", generated_text, "\n")
                     page.predictions.vlm_response = VlmPrediction(
                         text=generated_text,
                         generation_time=time.time() - start_time,
                         generated_tokens=generated_tokens,
                     )
+                    exit(-1)

                 yield page

-    def formulate_prompt(self, *, user_prompt: str, clusters: list[Cluster]) -> str:
+    def formulate_prompt(
+        self,
+        *,
+        user_prompt: str,
+        clusters: list[Cluster],
+        image_width: int,
+        image_height: int,
+        vlm_width: int = 512,
+        vlm_height: int = 512,
+    ) -> str:
+        """Formulate a prompt for the VLM."""
+        known_clusters = ["here is a list of unsorted text-blocks:", "<doctags>"]
+        for cluster in clusters:
+            print(" => ", cluster)
+            loc_l = f"<loc_{int(vlm_width * cluster.bbox.l / image_width)}>"
+            loc_b = f"<loc_{int(vlm_height * cluster.bbox.b / image_height)}>"
+            loc_r = f"<loc_{int(vlm_width * cluster.bbox.r / image_width)}>"
+            loc_t = f"<loc_{int(vlm_height * cluster.bbox.t / image_height)}>"
+            known_clusters.append(
+                f"<{cluster.label}>{loc_l}{loc_b}{loc_r}{loc_t}</{cluster.label}>"
+            )
+        known_clusters.append("</doctags>")
+        user_prompt = "\n".join(known_clusters) + f"\n\n{user_prompt}"
+        print("user-prompt: ", user_prompt, "\n")
+        return user_prompt
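
For reference, below is a minimal standalone sketch of the <loc_..> rescaling that the new formulate_prompt performs. Block and BBox are hypothetical stand-ins for docling's Cluster and bounding-box types; only the scaling arithmetic mirrors the diff above.

# Hedged sketch: Block/BBox are illustrative stand-ins, not docling types.
from dataclasses import dataclass


@dataclass
class BBox:
    l: float
    t: float
    r: float
    b: float


@dataclass
class Block:
    label: str
    bbox: BBox


def to_doctags(
    blocks: list[Block],
    image_width: int,
    image_height: int,
    vlm_width: int = 512,
    vlm_height: int = 512,
) -> str:
    """Rescale page-pixel boxes onto the VLM's 512x512 grid and wrap them in doctags."""
    lines = ["here is a list of unsorted text-blocks:", "<doctags>"]
    for block in blocks:
        loc_l = f"<loc_{int(vlm_width * block.bbox.l / image_width)}>"
        loc_b = f"<loc_{int(vlm_height * block.bbox.b / image_height)}>"
        loc_r = f"<loc_{int(vlm_width * block.bbox.r / image_width)}>"
        loc_t = f"<loc_{int(vlm_height * block.bbox.t / image_height)}>"
        lines.append(f"<{block.label}>{loc_l}{loc_b}{loc_r}{loc_t}</{block.label}>")
    lines.append("</doctags>")
    return "\n".join(lines)


# A box at (l=100, t=40, r=400, b=120) on a 1024x1024 page becomes
# <text><loc_50><loc_60><loc_200><loc_20></text> on the 512 grid.
print(to_doctags([Block("text", BBox(l=100, t=40, r=400, b=120))], 1024, 1024))

Note that the diff emits the coordinates in (l, b, r, t) order; if the target model expects top-left-origin (l, t, r, b) location tokens, that ordering may need adjusting depending on the bbox coordinate origin.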