mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
fix(pypdfium2): Fix OCR bounding box misalignment caused by mismatched rotation metadata (#2039)
* Fix OCR bounding box misalignment caused by rotation metadata
  Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
* Add rotation-mismatch scanned pdf test case
  Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
* add ground truth for ocr_test_rotation_mismatch.pdf
  Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
* add ground truth for ocr_test_rotation_mismatch.pdf
  Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
* Updated test GT and merged from main
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix OCR test by excluding mismatched rotation example
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
    """Yield a bounding box for every image object on this page.

    Each image position is read from the underlying pdfium page, remapped
    when the page reports a rotation of 90, 180 or 270, converted from a
    bottom-left to a top-left origin, and multiplied by ``scale``.

    Args:
        scale: Factor applied to each box before it is yielded.

    Yields:
        One ``BoundingBox`` (top-left origin, scaled) per image object whose
        area exceeds ``AREA_THRESHOLD``.
    """
    # Boxes must have an area strictly above this value to be reported; at 0
    # only empty boxes are dropped. The trailing note keeps 32 * 32 on record
    # as an alternative threshold.
    AREA_THRESHOLD = 0  # 32 * 32
    page_size = self.get_size()
    rotation = self._ppage.get_rotation()
    page_w = page_size.width
    page_h = page_size.height

    # The lock is held for the whole iteration, including across yields.
    with pypdfium2_lock:
        image_objects = self._ppage.get_objects(
            filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]
        )
        for image_obj in image_objects:
            raw = image_obj.get_pos()
            x0, y0, x1, y1 = raw

            # NOTE(review): presumably get_pos() reports coordinates in the
            # unrotated page frame while get_size() reflects the rotated
            # page, hence the remapping below -- confirm against pypdfium2.
            if rotation == 90:
                coords = (y0, page_h - x1, y1, page_h - x0)
            elif rotation == 180:
                coords = (page_w - x1, page_h - y1, page_w - x0, page_h - y0)
            elif rotation == 270:
                coords = (page_w - y1, x0, page_w - y0, x1)
            else:
                # Any other rotation value: use the position unchanged.
                coords = raw

            box = BoundingBox.from_tuple(coords, origin=CoordOrigin.BOTTOMLEFT)
            box = box.to_top_left_origin(page_height=page_h)

            if box.area() > AREA_THRESHOLD:
                yield box.scaled(scale=scale)
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
|
||||
Reference in New Issue
Block a user