Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-30 14:49:33 +02:00
commit 678eed2057
3 changed files with 41 additions and 8 deletions

View File

@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def unload(self): def unload(self):
super().unload() super().unload()
self.dp_doc.unload() # Unload docling-parse document first
with pypdfium2_lock: if self.dp_doc is not None:
self._pdoc.close() self.dp_doc.unload()
self._pdoc = None self.dp_doc = None
# Then close pypdfium2 document with proper locking
if self._pdoc is not None:
with pypdfium2_lock:
try:
self._pdoc.close()
except Exception:
# Ignore cleanup errors
pass
self._pdoc = None

View File

@ -144,7 +144,10 @@ class TesseractOcrModel(BaseOcrModel):
local_reader = self.reader local_reader = self.reader
self.osd_reader.SetImage(high_res_image) self.osd_reader.SetImage(high_res_image)
doc_orientation = 0
osd = self.osd_reader.DetectOrientationScript() osd = self.osd_reader.DetectOrientationScript()
# No text, or Orientation and Script detection failure # No text, or Orientation and Script detection failure
if osd is None: if osd is None:
_log.error( _log.error(
@ -158,11 +161,14 @@ class TesseractOcrModel(BaseOcrModel):
# to OCR in the hope OCR will succeed while OSD failed # to OCR in the hope OCR will succeed while OSD failed
if self._is_auto: if self._is_auto:
continue continue
doc_orientation = parse_tesseract_orientation(osd["orient_deg"]) else:
if doc_orientation != 0: doc_orientation = parse_tesseract_orientation(
high_res_image = high_res_image.rotate( osd["orient_deg"]
-doc_orientation, expand=True
) )
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
-doc_orientation, expand=True
)
if self._is_auto: if self._is_auto:
script = osd["script_name"] script = osd["script_name"]
script = map_tesseract_script(script) script = map_tesseract_script(script)

View File

@ -46,6 +46,12 @@ def test_text_cell_counts():
) )
last_cell_count = len(cells) last_cell_count = len(cells)
# Clean up page backend after each iteration
page_backend.unload()
# Explicitly clean up document backend to prevent race conditions in CI
doc_backend.unload()
def test_get_text_from_rect(test_doc_path): def test_get_text_from_rect(test_doc_path):
doc_backend = _get_backend(test_doc_path) doc_backend = _get_backend(test_doc_path)
@ -59,6 +65,10 @@ def test_get_text_from_rect(test_doc_path):
assert textpiece.strip() == ref assert textpiece.strip() == ref
# Explicitly clean up resources
page_backend.unload()
doc_backend.unload()
def test_crop_page_image(test_doc_path): def test_crop_page_image(test_doc_path):
doc_backend = _get_backend(test_doc_path) doc_backend = _get_backend(test_doc_path)
@ -70,7 +80,14 @@ def test_crop_page_image(test_doc_path):
) )
# im.show() # im.show()
# Explicitly clean up resources
page_backend.unload()
doc_backend.unload()
def test_num_pages(test_doc_path): def test_num_pages(test_doc_path):
doc_backend = _get_backend(test_doc_path) doc_backend = _get_backend(test_doc_path)
doc_backend.page_count() == 9 doc_backend.page_count() == 9
# Explicitly clean up resources to prevent race conditions in CI
doc_backend.unload()