diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py index cac07f80..5004563f 100644 --- a/docling/backend/docling_parse_v4_backend.py +++ b/docling/backend/docling_parse_v4_backend.py @@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend): def unload(self): super().unload() - self.dp_doc.unload() - with pypdfium2_lock: - self._pdoc.close() - self._pdoc = None + # Unload docling-parse document first + if self.dp_doc is not None: + self.dp_doc.unload() + self.dp_doc = None + + # Then close pypdfium2 document with proper locking + if self._pdoc is not None: + with pypdfium2_lock: + try: + self._pdoc.close() + except Exception: + # Ignore cleanup errors + pass + self._pdoc = None diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 0d520877..ed6306ba 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -144,7 +144,10 @@ class TesseractOcrModel(BaseOcrModel): local_reader = self.reader self.osd_reader.SetImage(high_res_image) + + doc_orientation = 0 osd = self.osd_reader.DetectOrientationScript() + # No text, or Orientation and Script detection failure if osd is None: _log.error( @@ -158,11 +161,14 @@ class TesseractOcrModel(BaseOcrModel): # to OCR in the hope OCR will succeed while OSD failed if self._is_auto: continue - doc_orientation = parse_tesseract_orientation(osd["orient_deg"]) - if doc_orientation != 0: - high_res_image = high_res_image.rotate( - -doc_orientation, expand=True + else: + doc_orientation = parse_tesseract_orientation( + osd["orient_deg"] ) + if doc_orientation != 0: + high_res_image = high_res_image.rotate( + -doc_orientation, expand=True + ) if self._is_auto: script = osd["script_name"] script = map_tesseract_script(script) diff --git a/tests/test_backend_docling_parse_v4.py b/tests/test_backend_docling_parse_v4.py index 35c4eab7..7e9dcda1 100644 --- a/tests/test_backend_docling_parse_v4.py +++ b/tests/test_backend_docling_parse_v4.py @@ -46,6 +46,12 @@ def test_text_cell_counts(): ) last_cell_count = len(cells) + # Clean up page backend after each iteration + page_backend.unload() + + # Explicitly clean up document backend to prevent race conditions in CI + doc_backend.unload() + def test_get_text_from_rect(test_doc_path): doc_backend = _get_backend(test_doc_path) @@ -59,6 +65,10 @@ def test_get_text_from_rect(test_doc_path): assert textpiece.strip() == ref + # Explicitly clean up resources + page_backend.unload() + doc_backend.unload() + def test_crop_page_image(test_doc_path): doc_backend = _get_backend(test_doc_path) @@ -70,7 +80,14 @@ def test_crop_page_image(test_doc_path): ) # im.show() + # Explicitly clean up resources + page_backend.unload() + doc_backend.unload() + def test_num_pages(test_doc_path): doc_backend = _get_backend(test_doc_path) doc_backend.page_count() == 9 + + # Explicitly clean up resources to prevent race conditions in CI + doc_backend.unload()