From 5dffd10eaef7c8965e3cdf682a3e5b101139b092 Mon Sep 17 00:00:00 2001
From: Maksym Lysak <mly@zurich.ibm.com>
Date: Tue, 18 Mar 2025 10:14:16 +0100
Subject: [PATCH] Added support for force_backend_text parameter

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
---
 docling/pipeline/vlm_pipeline.py | 38 +++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index aa89c415..6e73741b 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -12,6 +12,7 @@ from docling_core.types.doc import (  # DocItemLabel,; DoclingDocument,; GroupLa
     DocItem,
     ImageRef,
     PictureItem,
+    TextItem,
 )
 from docling_core.types.doc.document import DocTagsDocument
 from docling_core.types.doc.tokens import DocumentToken, TableToken
@@ -87,6 +88,19 @@ class VlmPipeline(PaginatedPipeline):
 
         return page
 
+    def extract_text_from_backend(self, page: Page, bbox: BoundingBox | None) -> str:
+        # Convert bounding box normalized to 0-100 into page coordinates for cropping
+        text = ""
+        if bbox:
+            if page.size:
+                # bbox.l = bbox.l * page.size.width
+                # bbox.t = bbox.t * page.size.height
+                # bbox.r = bbox.r * page.size.width
+                # bbox.b = bbox.b * page.size.height
+                if page._backend:
+                    text = page._backend.get_text_in_rect(bbox)
+        return text
+
     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
 
@@ -112,9 +126,27 @@ class VlmPipeline(PaginatedPipeline):
                     doctags_list_c, image_list_c
                 )
                 conv_res.document.load_from_doctags(doctags_doc)
-                # USE THIS TO FORCE BACKEND TEXT
-                # if self.force_backend_text:
-                #     text_content = extract_text_from_backend(page, bbox)
+
+                # If forced backend text, replace model predicted text with backend one
+                if page.size:
+                    if self.force_backend_text:
+                        scale = self.pipeline_options.images_scale
+                        for element, _level in conv_res.document.iterate_items():
+                            if (
+                                not isinstance(element, TextItem)
+                                or len(element.prov) == 0
+                            ):
+                                continue
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+                            txt = self.extract_text_from_backend(page, crop_bbox)
+                            element.text = txt
+                            element.orig = txt
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN