feat: Make Page.parsed_page the only source of truth for text cells, add OCR cells to it (#1745)

* Keep page.parsed_page.textline_cells and page.cells in sync, including OCR

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Make page.parsed_page the only source of truth for text cells

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Small fix

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Correctly compute PDF boxes from pymupdf

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use different OCR engine order

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add type hints and fix mypy

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* One more test fix

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Remove with pypdfium2_lock from caller sites

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix typing

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-06-13 19:01:55 +02:00
parent 0432a31b2f
commit 7d3302cb48
50 changed files with 339091 additions and 330047 deletions
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -57,14 +57,14 @@ def test_e2e_conversions():
    pdf_paths = get_pdf_paths()

    engines: List[Tuple[OcrOptions, bool]] = [
-        (EasyOcrOptions(), False),
        (TesseractOcrOptions(), True),
        (TesseractCliOcrOptions(), True),
-        (EasyOcrOptions(force_full_page_ocr=True), False),
+        (EasyOcrOptions(), False),
        (TesseractOcrOptions(force_full_page_ocr=True), True),
        (TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
        (TesseractCliOcrOptions(force_full_page_ocr=True), True),
        (TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
+        (EasyOcrOptions(force_full_page_ocr=True), False),
    ]

    # rapidocr is only available for Python >=3.6,<3.13