fix(pypdfium2): Fix OCR bounding box misalignment caused by mismatched rotation metadata (#2039)

* Fix OCR bounding box misalignment caused by rotation metadata

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>

* Add rotation-mismatch scanned PDF test case

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>

* add ground truth for ocr_test_rotation_mismatch.pdf

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>

* add ground truth for ocr_test_rotation_mismatch.pdf

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>

* Updated test ground truth (GT) and merged from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix OCR test by excluding mismatched rotation example

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
AndrewTsai0406
2025-09-01 23:22:43 +08:00
committed by GitHub
parent 9f4bc5b2f1
commit 4d94e38223
4 changed files with 45 additions and 3 deletions

View File

@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
    """Yield the bounding box of every image object found on this page.

    Each box is read from the image object's position, remapped when the
    page rotation is 90, 180 or 270, converted to a top-left origin and
    then scaled by ``scale``. Boxes whose area does not exceed the area
    threshold (currently 0) are skipped.

    Args:
        scale: Factor applied to each box just before it is yielded.

    Yields:
        One scaled, top-left-origin ``BoundingBox`` per qualifying image.
    """
    min_area = 0  # 32 * 32
    size = self.get_size()
    angle = self._ppage.get_rotation()

    def reorient(coords):
        # Remap a position tuple so it lines up with the rotated page.
        # NOTE(review): assumes coords is (left, bottom, right, top) in the
        # unrotated page frame and that get_size() reports the dimensions
        # of the page after rotation -- confirm against pypdfium2.
        if angle == 90:
            return (
                coords[1],
                size.height - coords[2],
                coords[3],
                size.height - coords[0],
            )
        if angle == 180:
            return (
                size.width - coords[2],
                size.height - coords[3],
                size.width - coords[0],
                size.height - coords[1],
            )
        if angle == 270:
            return (
                size.width - coords[3],
                coords[0],
                size.width - coords[1],
                coords[2],
            )
        # Any other rotation value: use the position exactly as reported.
        return coords

    # The lock remains held while the generator is suspended at a yield.
    with pypdfium2_lock:
        images = self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE])
        for image in images:
            bottom_left_box = BoundingBox.from_tuple(
                reorient(image.get_pos()), origin=CoordOrigin.BOTTOMLEFT
            )
            box = bottom_left_box.to_top_left_origin(page_height=size.height)
            if box.area() > min_area:
                yield box.scaled(scale=scale)
def get_text_in_rect(self, bbox: BoundingBox) -> str: