From 069378aefbd53d26ff7b16a7e479bbbdb5a081a6 Mon Sep 17 00:00:00 2001
From: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>
Date: Tue, 13 May 2025 14:31:33 +0530
Subject: [PATCH] Update docling_parse_v4_backend.py

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>
---
 docling/backend/docling_parse_v4_backend.py | 333 +++++---------------
 1 file changed, 81 insertions(+), 252 deletions(-)

diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py
index 1ac199bf..3e59f123 100644
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@@ -1,11 +1,8 @@
 import logging
 from collections.abc import Iterable
-from functools import lru_cache
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Optional, Union, List, Dict
-from concurrent.futures import ThreadPoolExecutor
-import time
+from typing import TYPE_CHECKING, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -23,153 +20,81 @@ if TYPE_CHECKING:
 
 _log = logging.getLogger(__name__)
 
-# Constants
-AREA_THRESHOLD = 0  # Threshold for bitmap area processing
-OVERLAP_THRESHOLD = 0.5  # Threshold for text cell overlap
-IMAGE_SCALE_FACTOR = 1.5  # Scale factor for rendering images
-MAX_WORKERS = 4  # Maximum number of worker threads for parallel processing
-PAGE_CACHE_SIZE = 5  # Number of pages to keep in memory
-
 
 class DoclingParseV4PageBackend(PdfPageBackend):
     def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
         self._ppage = page_obj
         self._dpage = parsed_page
         self.valid = parsed_page is not None
-        self._page_size: Optional[Size] = None
-        self._textline_cells_top_left: Optional[List[TextCell]] = None
-        self._creation_time = time.time()
-        self._last_access_time = self._creation_time
 
     def is_valid(self) -> bool:
-        self._last_access_time = time.time()
         return self.valid
 
-    @lru_cache(maxsize=1)
-    def get_size(self) -> Size:
-        """Get page size with caching for better performance."""
-        self._last_access_time = time.time()
-        if self._page_size is None:
-            with pypdfium2_lock:
-                self._page_size = Size(
-                    width=self._ppage.get_width(), height=self._ppage.get_height()
-                )
-        return self._page_size
-
-    def _get_textline_cells_top_left(self) -> List[TextCell]:
-        """Get text cells converted to top-left origin (with caching)."""
-        self._last_access_time = time.time()
-        if self._textline_cells_top_left is None:
-            page_size = self.get_size()
-            self._textline_cells_top_left = []
-            
-            for cell in self._dpage.textline_cells:
-                # Create a copy of the cell with top-left origin
-                cell_copy = cell.model_copy(deep=True)
-                cell_copy.rect = cell_copy.rect.to_top_left_origin(page_size.height)
-                self._textline_cells_top_left.append(cell_copy)
-                
-        return self._textline_cells_top_left
-
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        """Extract text from cells that overlap with the given bounding box."""
-        self._last_access_time = time.time()
-        if not self.valid:
-            return ""
-            
-        text_pieces = []
+        # Find intersecting cells on the page
+        text_piece = ""
         page_size = self.get_size()
-        scale = 1
 
-        # Ensure bbox is in top-left origin
-        if bbox.coord_origin != CoordOrigin.TOPLEFT:
-            bbox = bbox.to_top_left_origin(page_height=page_size.height)
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+
+        for i, cell in enumerate(self._dpage.textline_cells):
+            cell_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page_size.height)
+                .scaled(scale)
+            )
 
-        for cell in self._get_textline_cells_top_left():
-            cell_bbox = cell.rect.to_bounding_box().scaled(scale)
-            
-            # Calculate intersection area
             overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
-            
-            if overlap_frac > OVERLAP_THRESHOLD:
-                text_pieces.append(cell.text)
 
-        return " ".join(text_pieces)
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell.text
 
-    def get_text_in_rects(self, bboxes: List[BoundingBox]) -> List[str]:
-        """Extract text from multiple regions in a single pass for better performance."""
-        self._last_access_time = time.time()
-        if not self.valid or not bboxes:
-            return [""] * len(bboxes)
-            
-        page_size = self.get_size()
-        results = [""] * len(bboxes)
-        text_pieces = [[] for _ in range(len(bboxes))]
-        
-        # Ensure all bboxes are in top-left origin
-        normalized_bboxes = []
-        for bbox in bboxes:
-            if bbox.coord_origin != CoordOrigin.TOPLEFT:
-                normalized_bboxes.append(bbox.to_top_left_origin(page_height=page_size.height))
-            else:
-                normalized_bboxes.append(bbox)
-        
-        # Process all cells once
-        for cell in self._get_textline_cells_top_left():
-            cell_bbox = cell.rect.to_bounding_box()
-            
-            # Check against all bboxes
-            for i, bbox in enumerate(normalized_bboxes):
-                overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
-                if overlap_frac > OVERLAP_THRESHOLD:
-                    text_pieces[i].append(cell.text)
-                    
-        # Join text pieces for each bbox
-        for i in range(len(bboxes)):
-            results[i] = " ".join(text_pieces[i])
-                
-        return results
+        return text_piece
 
     def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        self._last_access_time = time.time()
         return self._dpage
 
     def get_text_cells(self) -> Iterable[TextCell]:
-        """Return text cells in top-left origin coordinates."""
-        self._last_access_time = time.time()
-        if not self.valid:
-            return []
-            
-        return self._get_textline_cells_top_left()
+        page_size = self.get_size()
+
+        [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
+
+        # for cell in self._dpage.textline_cells:
+        #     rect = cell.rect
+        #
+        #     assert (
+        #         rect.to_bounding_box().l <= rect.to_bounding_box().r
+        #     ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
+        #     assert (
+        #         rect.to_bounding_box().t <= rect.to_bounding_box().b
+        #     ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
+
+        return self._dpage.textline_cells
 
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        """Get bounding boxes for bitmap images on the page."""
-        self._last_access_time = time.time()
-        if not self.valid:
-            return []
-            
-        page_height = self.get_size().height
-        
-        for img in self._dpage.bitmap_resources:
-            cropbox = img.rect.to_bounding_box().to_top_left_origin(page_height)
+        AREA_THRESHOLD = 0  # 32 * 32
+
+        images = self._dpage.bitmap_resources
+
+        for img in images:
+            cropbox = img.rect.to_bounding_box().to_top_left_origin(
+                self.get_size().height
+            )
 
             if cropbox.area() > AREA_THRESHOLD:
-                yield cropbox.scaled(scale=scale)
+                cropbox = cropbox.scaled(scale=scale)
+
+                yield cropbox
 
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-        """Render the page as an image, optionally cropped to a specific region."""
-        self._last_access_time = time.time()
         page_size = self.get_size()
 
-        # Skip rendering if page is invalid
-        if not self.valid:
-            # Return blank image of appropriate size
-            width = round(page_size.width * scale)
-            height = round(page_size.height * scale)
-            return Image.new('RGB', (width, height), color='white')
-
         if not cropbox:
             cropbox = BoundingBox(
                 l=0,
@@ -189,174 +114,78 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         with pypdfium2_lock:
             image = (
                 self._ppage.render(
-                    scale=scale * IMAGE_SCALE_FACTOR,
-                    rotation=0,
+                    scale=scale * 1.5,
+                    rotation=0,  # no additional rotation
                     crop=padbox.as_tuple(),
                 )
                 .to_pil()
                 .resize(
                     size=(round(cropbox.width * scale), round(cropbox.height * scale))
                 )
-            )
+            )  # We resize the image from 1.5x the given scale to make it sharper.
 
         return image
 
+    def get_size(self) -> Size:
+        with pypdfium2_lock:
+            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+
+        # TODO: Take width and height from docling-parse.
+        # return Size(
+        #    width=self._dpage.dimension.width,
+        #    height=self._dpage.dimension.height,
+        # )
+
     def unload(self):
-        """Clean up resources."""
         self._ppage = None
         self._dpage = None
-        self._page_size = None
-        self._textline_cells_top_left = None
 
 
 class DoclingParseV4DocumentBackend(PdfDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
-        """Initialize the document backend with error handling."""
         super().__init__(in_doc, path_or_stream)
-        self._page_cache: Dict[int, DoclingParseV4PageBackend] = {}
-        self._executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 
-        try:
-            with pypdfium2_lock:
-                self._pdoc = pdfium.PdfDocument(self.path_or_stream)
-                
-            self.parser = DoclingPdfParser(loglevel="fatal")
-            self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
-            
-            if self.dp_doc is None:
-                raise RuntimeError(f"Failed to load document with docling-parse v4")
-                
-        except Exception as e:
-            _log.error(f"Error initializing DoclingParseV4: {str(e)}")
+        with pypdfium2_lock:
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+        self.parser = DoclingPdfParser(loglevel="fatal")
+        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
+        success = self.dp_doc is not None
+
+        if not success:
             raise RuntimeError(
-                f"docling-parse v4 could not load document {self.document_hash}: {str(e)}"
+                f"docling-parse v4 could not load document {self.document_hash}."
             )
 
     def page_count(self) -> int:
-        """Get the number of pages in the document with validation."""
+        # return len(self._pdoc)  # To be replaced with docling-parse API
+
         len_1 = len(self._pdoc)
         len_2 = self.dp_doc.number_of_pages()
 
         if len_1 != len_2:
-            _log.warning(f"Inconsistent number of pages: {len_1}!={len_2}")
+            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
 
         return len_2
 
-    def _manage_cache(self):
-        """Manage page cache size by removing least recently used pages."""
-        if len(self._page_cache) > PAGE_CACHE_SIZE:
-            # Sort pages by last access time
-            sorted_pages = sorted(
-                self._page_cache.items(), 
-                key=lambda item: item[1]._last_access_time
-            )
-            # Remove oldest pages
-            pages_to_remove = len(self._page_cache) - PAGE_CACHE_SIZE
-            for i in range(pages_to_remove):
-                page_no, page = sorted_pages[i]
-                page.unload()
-                del self._page_cache[page_no]
-
     def load_page(
         self, page_no: int, create_words: bool = True, create_textlines: bool = True
     ) -> DoclingParseV4PageBackend:
-        """Load a specific page with error handling and caching."""
-        # Check cache first
-        if page_no in self._page_cache:
-            self._page_cache[page_no]._last_access_time = time.time()
-            return self._page_cache[page_no]
-            
-        try:
-            with pypdfium2_lock:
-                page_backend = DoclingParseV4PageBackend(
-                    self.dp_doc.get_page(
-                        page_no + 1,
-                        create_words=create_words,
-                        create_textlines=create_textlines,
-                    ),
-                    self._pdoc[page_no],
-                )
-            
-            # Add to cache
-            self._page_cache[page_no] = page_backend
-            self._manage_cache()
-            
-            return page_backend
-        except Exception as e:
-            _log.error(f"Error loading page {page_no}: {str(e)}")
-            # Return an invalid page backend instead of raising an exception
-            with pypdfium2_lock:
-                return DoclingParseV4PageBackend(None, self._pdoc[page_no])
-
-    def load_pages_in_parallel(
-        self, page_numbers: List[int], create_words: bool = True, create_textlines: bool = True
-    ) -> List[DoclingParseV4PageBackend]:
-        """Load multiple pages in parallel for better performance."""
-        # Check which pages are already in cache
-        pages_to_load = []
-        results = [None] * len(page_numbers)
-        
-        for i, page_no in enumerate(page_numbers):
-            if page_no in self._page_cache:
-                self._page_cache[page_no]._last_access_time = time.time()
-                results[i] = self._page_cache[page_no]
-            else:
-                pages_to_load.append((i, page_no))
-        
-        if not pages_to_load:
-            return results
-            
-        # Define a function to load a single page
-        def load_single_page(idx_page_tuple):
-            idx, page_no = idx_page_tuple
-            try:
-                with pypdfium2_lock:
-                    page = self.dp_doc.get_page(
-                        page_no + 1,
-                        create_words=create_words,
-                        create_textlines=create_textlines,
-                    )
-                    ppage = self._pdoc[page_no]
-                    
-                page_backend = DoclingParseV4PageBackend(page, ppage)
-                return idx, page_no, page_backend
-            except Exception as e:
-                _log.error(f"Error loading page {page_no} in parallel: {str(e)}")
-                with pypdfium2_lock:
-                    return idx, page_no, DoclingParseV4PageBackend(None, self._pdoc[page_no])
-        
-        # Load pages in parallel
-        for idx, page_no, page_backend in self._executor.map(load_single_page, pages_to_load):
-            results[idx] = page_backend
-            self._page_cache[page_no] = page_backend
-        
-        self._manage_cache()
-        return results
+        with pypdfium2_lock:
+            return DoclingParseV4PageBackend(
+                self.dp_doc.get_page(
+                    page_no + 1,
+                    create_words=create_words,
+                    create_textlines=create_textlines,
+                ),
+                self._pdoc[page_no],
+            )
 
     def is_valid(self) -> bool:
-        """Check if the document is valid."""
-        return self.dp_doc is not None and self.page_count() > 0
+        return self.page_count() > 0
 
     def unload(self):
-        """Clean up resources properly."""
         super().unload()
-        
-        # Unload all cached pages
-        for page in self._page_cache.values():
-            page.unload()
-        self._page_cache.clear()
-        
-        # Shutdown executor
-        if hasattr(self, '_executor'):
-            self._executor.shutdown(wait=False)
-        
-        if hasattr(self, 'dp_doc') and self.dp_doc is not None:
-            self.dp_doc.unload()
-        
-        if hasattr(self, '_pdoc') and self._pdoc is not None:
-            with pypdfium2_lock:
-                self._pdoc.close()
-                
+        self.dp_doc.unload()
+        with pypdfium2_lock:
+            self._pdoc.close()
         self._pdoc = None
-        self.dp_doc = None
-        self.parser = None