Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2025-06-30 14:49:33 +02:00 · 2025-06-30 14:49:33 +02:00 · 678eed2057
commit 678eed2057
parent 92eb1517b6 bdfee4e2d0
3 changed files with 41 additions and 8 deletions
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
    def unload(self):
        super().unload()
-        self.dp_doc.unload()
+        # Unload docling-parse document first
-        with pypdfium2_lock:
+        if self.dp_doc is not None:
-            self._pdoc.close()
+            self.dp_doc.unload()
-        self._pdoc = None
+            self.dp_doc = None
        # Then close pypdfium2 document with proper locking
        if self._pdoc is not None:
            with pypdfium2_lock:
                try:
                    self._pdoc.close()
                except Exception:
                    # Ignore cleanup errors
                    pass
            self._pdoc = None
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -144,7 +144,10 @@ class TesseractOcrModel(BaseOcrModel):
                        local_reader = self.reader
                        self.osd_reader.SetImage(high_res_image)
                        doc_orientation = 0
                        osd = self.osd_reader.DetectOrientationScript()
                        # No text, or Orientation and Script detection failure
                        if osd is None:
                            _log.error(
@ -158,11 +161,14 @@ class TesseractOcrModel(BaseOcrModel):
                            # to OCR in the hope OCR will succeed while OSD failed
                            if self._is_auto:
                                continue
-                        doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
+                        else:
-                        if doc_orientation != 0:
+                            doc_orientation = parse_tesseract_orientation(
-                            high_res_image = high_res_image.rotate(
+                                osd["orient_deg"]
                                -doc_orientation, expand=True
                            )
                            if doc_orientation != 0:
                                high_res_image = high_res_image.rotate(
                                    -doc_orientation, expand=True
                                )
                        if self._is_auto:
                            script = osd["script_name"]
                            script = map_tesseract_script(script)
--- a/tests/test_backend_docling_parse_v4.py
+++ b/tests/test_backend_docling_parse_v4.py
@ -46,6 +46,12 @@ def test_text_cell_counts():
                )
            last_cell_count = len(cells)
            # Clean up page backend after each iteration
            page_backend.unload()
    # Explicitly clean up document backend to prevent race conditions in CI
    doc_backend.unload()
 def test_get_text_from_rect(test_doc_path):
    doc_backend = _get_backend(test_doc_path)
@ -59,6 +65,10 @@ def test_get_text_from_rect(test_doc_path):
    assert textpiece.strip() == ref
    # Explicitly clean up resources
    page_backend.unload()
    doc_backend.unload()
 def test_crop_page_image(test_doc_path):
    doc_backend = _get_backend(test_doc_path)
@ -70,7 +80,14 @@ def test_crop_page_image(test_doc_path):
    )
    # im.show()
    # Explicitly clean up resources
    page_backend.unload()
    doc_backend.unload()
 def test_num_pages(test_doc_path):
    """Check the page count the backend reports for the document at test_doc_path."""
    doc_backend = _get_backend(test_doc_path)
    try:
        # Previously a bare `==` expression: its result was discarded, so the
        # test passed regardless of the reported page count.
        assert doc_backend.page_count() == 9
    finally:
        # Explicitly clean up resources to prevent race conditions in CI,
        # including when the assertion above fails.
        doc_backend.unload()