mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(pypdfium2): Fix OCR bounding box misalignment caused by mismatched rotation metadata (#2039)
* Fix OCR bounding box misalignment caused by rotation metadata
  Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
* Add rotation-mismatch scanned pdf test case
  Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
* add ground truth for ocr_test_rotation_mismatch.pdf
  Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
* add ground truth for ocr_test_rotation_mismatch.pdf
  Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
* Updated test GT and merged from main
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix OCR test by excluding mismatched rotation example
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: AndrewTsai0406 <tsai247365@gmail.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
    """Yield the bounding boxes of the image objects on this page.

    For every image object, the position reported by pdfium is remapped
    according to the page's rotation value (90, 180 or 270), converted from
    a bottom-left to a top-left origin, and scaled by ``scale``.

    All pdfium calls are made while holding ``pypdfium2_lock``. The boxes
    are collected first and yielded only after the lock has been released,
    so the lock is never held while the caller processes a box.

    Args:
        scale: Factor applied to every yielded box.

    Yields:
        One top-left-origin ``BoundingBox`` per image object whose area is
        greater than ``AREA_THRESHOLD``.
    """
    AREA_THRESHOLD = 0  # 32 * 32

    # Kept outside the lock, exactly where the original code calls it.
    # NOTE(review): presumably get_size() takes pypdfium2_lock itself, in
    # which case it must not be moved inside the `with` below - confirm.
    page_size = self.get_size()

    rects = []
    with pypdfium2_lock:
        # Reading the rotation is a pdfium call too, so it belongs under the lock.
        rotation = self._ppage.get_rotation()

        for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
            left, bottom, right, top = obj.get_pos()

            # Remap (left, bottom, right, top) for the page rotation; the
            # page size used here is the one returned by get_size().
            if rotation == 90:
                pos = (
                    bottom,
                    page_size.height - right,
                    top,
                    page_size.height - left,
                )
            elif rotation == 180:
                pos = (
                    page_size.width - right,
                    page_size.height - top,
                    page_size.width - left,
                    page_size.height - bottom,
                )
            elif rotation == 270:
                pos = (
                    page_size.width - top,
                    left,
                    page_size.width - bottom,
                    right,
                )
            else:
                pos = (left, bottom, right, top)

            cropbox = BoundingBox.from_tuple(
                pos, origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(page_height=page_size.height)

            # Boxes that do not exceed the threshold are dropped.
            if cropbox.area() > AREA_THRESHOLD:
                rects.append(cropbox.scaled(scale=scale))

    # Lock released: hand the results to the caller.
    yield from rects
|
||||||
|
|
||||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||||
|
|||||||
BIN
tests/data_scanned/sample_with_rotation_mismatch.pdf
vendored
Normal file
BIN
tests/data_scanned/sample_with_rotation_mismatch.pdf
vendored
Normal file
Binary file not shown.
@@ -9,6 +9,8 @@ from docling.backend.pypdfium2_backend import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@@ -27,6 +29,23 @@ def _get_backend(pdf_doc):
|
|||||||
return doc_backend
|
return doc_backend
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_text_from_rect_rotated():
    """Convert the rotation-mismatch sample with OCR on and look for "1972"."""
    sample_path = Path("./tests/data_scanned/sample_with_rotation_mismatch.pdf")

    # Enable OCR on otherwise default pipeline options.
    ocr_options = PdfPipelineOptions()
    ocr_options.do_ocr = True

    # Route PDF input through the pypdfium2 backend.
    pdf_option = PdfFormatOption(
        pipeline_options=ocr_options, backend=PyPdfiumDocumentBackend
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    result = converter.convert(sample_path)
    markdown = result.document.export_to_markdown()

    assert "1972" in markdown
|
||||||
|
|
||||||
|
|
||||||
def test_text_cell_counts():
|
def test_text_cell_counts():
|
||||||
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
|
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,8 @@ def get_pdf_paths():
|
|||||||
directory = Path("./tests/data_scanned")
|
directory = Path("./tests/data_scanned")
|
||||||
|
|
||||||
# List all PDF files in the directory and its subdirectories
|
# List all PDF files in the directory and its subdirectories
|
||||||
pdf_files = sorted(directory.rglob("*.pdf"))
|
pdf_files = sorted(directory.rglob("ocr_test*.pdf"))
|
||||||
|
|
||||||
return pdf_files
|
return pdf_files
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user