From f4c1836c96625eb1afb1437081e81a96db1283b1 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Thu, 10 Jul 2025 16:15:54 +0200 Subject: [PATCH] functional working two-stage, need to implement a good prompt now to leverage bounding boxes Signed-off-by: Peter Staar --- .../vlm_models_inline/two_stage_vlm_model.py | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/docling/models/vlm_models_inline/two_stage_vlm_model.py b/docling/models/vlm_models_inline/two_stage_vlm_model.py index 2ef18692..2bf5958f 100644 --- a/docling/models/vlm_models_inline/two_stage_vlm_model.py +++ b/docling/models/vlm_models_inline/two_stage_vlm_model.py @@ -64,7 +64,10 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): user_prompt = self.vlm_model.get_user_prompt(page=page) prompt = self.formulate_prompt( - user_prompt=user_prompt, clusters=processed_clusters + user_prompt=user_prompt, + clusters=processed_clusters, + image_width=page_image.width, + image_height=page_image.height, ) start_time = time.time() @@ -73,15 +76,44 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): page_image=page_image, prompt=prompt ) ) - + print("generated-text: \n", generated_text, "\n") page.predictions.vlm_response = VlmPrediction( text=generated_text, generation_time=time.time() - start_time, generated_tokens=generated_tokens, ) + exit(-1) + yield page - def formulate_prompt(self, *, user_prompt: str, clusters: list[Cluster]) -> str: + def formulate_prompt( + self, + *, + user_prompt: str, + clusters: list[Cluster], + image_width: int, + image_height: int, + vlm_width: int = 512, + vlm_height: int = 512, + ) -> str: """Formulate a prompt for the VLM.""" + known_clusters = ["here is a list of unsorted text-blocks:", ""] + for cluster in clusters: + print(" => ", cluster) + + loc_l = f"" + loc_b = f"" + loc_r = f"" + loc_t = f"" + + known_clusters.append( + f"<{cluster.label}>{loc_l}{loc_b}{loc_r}{loc_t}" + ) + + known_clusters.append("") + + user_prompt = "\n".join(known_clusters) + f"\n\n{user_prompt}" + print("user-prompt: ", user_prompt, "\n") + return user_prompt