fix(pypdfium2): Fix OCR bounding box misalignment caused by mismatched rotation metadata (#2039)

* Fix OCR bounding box misalignment caused by rotation metadata

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>

* Add rotation-mismatch scanned PDF test case

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>

* add ground truth for ocr_test_rotation_mismatch.pdf

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>

* add ground truth for ocr_test_rotation_mismatch.pdf

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>

* Updated test ground truth (GT) and merged from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix OCR test by excluding mismatched rotation example

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
AndrewTsai0406
2025-09-01 23:22:43 +08:00
committed by GitHub
parent 9f4bc5b2f1
commit 4d94e38223
4 changed files with 45 additions and 3 deletions

Binary file not shown.

View File

@@ -9,6 +9,8 @@ from docling.backend.pypdfium2_backend import (
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture
@@ -27,6 +29,23 @@ def _get_backend(pdf_doc):
return doc_backend
def test_get_text_from_rect_rotated():
    """Run an OCR-enabled conversion of the rotation-mismatch sample PDF.

    The document is converted end to end through ``DocumentConverter``
    using ``PyPdfiumDocumentBackend`` with OCR switched on. The test passes
    when the string "1972" is present in the Markdown export of the
    converted document.
    """
    source_path = Path("./tests/data_scanned/sample_with_rotation_mismatch.pdf")

    # Build the pipeline options first, then flip OCR on explicitly.
    ocr_options = PdfPipelineOptions()
    ocr_options.do_ocr = True

    pdf_format_option = PdfFormatOption(
        pipeline_options=ocr_options,
        backend=PyPdfiumDocumentBackend,
    )
    converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option},
    )

    result = converter.convert(source_path)
    markdown = result.document.export_to_markdown()
    assert "1972" in markdown
def test_text_cell_counts():
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

View File

@@ -31,7 +31,8 @@ def get_pdf_paths():
directory = Path("./tests/data_scanned")
# List the OCR test PDFs (files matching "ocr_test*.pdf") in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.pdf"))
pdf_files = sorted(directory.rglob("ocr_test*.pdf"))
return pdf_files