Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-30 14:49:33 +02:00
commit 678eed2057
3 changed files with 41 additions and 8 deletions

View File

@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def unload(self):
    """Release the docling-parse and pypdfium2 documents held by this backend.

    Each handle is released only while it is still set and is cleared
    right after, so a repeated ``unload()`` call skips a handle that was
    already released instead of dereferencing ``None``.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    super().unload()

    # Unload docling-parse document first
    if self.dp_doc is not None:
        self.dp_doc.unload()
        self.dp_doc = None

    # Then close pypdfium2 document with proper locking
    if self._pdoc is not None:
        with pypdfium2_lock:
            try:
                self._pdoc.close()
            except Exception:
                # Cleanup stays best-effort (never raises), but leave a
                # trace instead of discarding the error silently.
                logging.getLogger(__name__).debug(
                    "Ignoring error while closing pypdfium2 document",
                    exc_info=True,
                )
        self._pdoc = None

View File

@ -144,7 +144,10 @@ class TesseractOcrModel(BaseOcrModel):
local_reader = self.reader
self.osd_reader.SetImage(high_res_image)
doc_orientation = 0
osd = self.osd_reader.DetectOrientationScript()
# No text, or Orientation and Script detection failure
if osd is None:
_log.error(
@ -158,11 +161,14 @@ class TesseractOcrModel(BaseOcrModel):
# to OCR in the hope OCR will succeed while OSD failed
if self._is_auto:
continue
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
-doc_orientation, expand=True
else:
doc_orientation = parse_tesseract_orientation(
osd["orient_deg"]
)
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
-doc_orientation, expand=True
)
if self._is_auto:
script = osd["script_name"]
script = map_tesseract_script(script)

View File

@ -46,6 +46,12 @@ def test_text_cell_counts():
)
last_cell_count = len(cells)
# Clean up page backend after each iteration
page_backend.unload()
# Explicitly clean up document backend to prevent race conditions in CI
doc_backend.unload()
def test_get_text_from_rect(test_doc_path):
doc_backend = _get_backend(test_doc_path)
@ -59,6 +65,10 @@ def test_get_text_from_rect(test_doc_path):
assert textpiece.strip() == ref
# Explicitly clean up resources
page_backend.unload()
doc_backend.unload()
def test_crop_page_image(test_doc_path):
doc_backend = _get_backend(test_doc_path)
@ -70,7 +80,14 @@ def test_crop_page_image(test_doc_path):
)
# im.show()
# Explicitly clean up resources
page_backend.unload()
doc_backend.unload()
def test_num_pages(test_doc_path):
    """Check the page count the backend reports for the test document."""
    doc_backend = _get_backend(test_doc_path)
    try:
        # The comparison used to be a bare expression whose result was
        # discarded, so the test could never fail; assert it instead.
        assert doc_backend.page_count() == 9
    finally:
        # Explicitly clean up resources to prevent race conditions in CI,
        # also when the assertion above fails.
        doc_backend.unload()