def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
    """Yield bounding boxes for the image objects found on this page.

    Each box is built from the object's ``get_pos()`` tuple, remapped when
    the page reports a rotation of 90, 180 or 270, converted from a
    bottom-left to a top-left origin, and then passed through
    ``BoundingBox.scaled`` with ``scale``. Boxes whose area does not exceed
    ``AREA_THRESHOLD`` are dropped.

    Args:
        scale: Factor handed to ``BoundingBox.scaled`` for every yielded box.

    Yields:
        A scaled, top-left-origin ``BoundingBox`` for each image object
        that passes the area filter.
    """
    AREA_THRESHOLD = 0  # 32 * 32
    page_size = self.get_size()
    rotation = self._ppage.get_rotation()

    with pypdfium2_lock:
        image_objects = self._ppage.get_objects(
            filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]
        )
        for image_obj in image_objects:
            pos = image_obj.get_pos()

            # Assumes get_pos() is (left, bottom, right, top) in the
            # unrotated page frame and that get_size() reports the rotated
            # page dimensions -- TODO confirm against pypdfium2.
            if rotation == 90:
                x0, y0, x1, y1 = pos[0], pos[1], pos[2], pos[3]
                pos = (
                    y0,
                    page_size.height - x1,
                    y1,
                    page_size.height - x0,
                )
            elif rotation == 180:
                x0, y0, x1, y1 = pos[0], pos[1], pos[2], pos[3]
                pos = (
                    page_size.width - x1,
                    page_size.height - y1,
                    page_size.width - x0,
                    page_size.height - y0,
                )
            elif rotation == 270:
                x0, y0, x1, y1 = pos[0], pos[1], pos[2], pos[3]
                pos = (
                    page_size.width - y1,
                    x0,
                    page_size.width - y0,
                    x1,
                )

            bottom_left_box = BoundingBox.from_tuple(
                pos, origin=CoordOrigin.BOTTOMLEFT
            )
            cropbox = bottom_left_box.to_top_left_origin(
                page_height=page_size.height
            )

            # NOTE(review): the yield is kept inside the area filter; the
            # original indentation could not be recovered -- confirm.
            if not cropbox.area() > AREA_THRESHOLD:
                continue
            yield cropbox.scaled(scale=scale)
def test_get_text_from_rect_rotated():
    """Convert the rotation-mismatch sample with OCR and look for "1972".

    The PDF is run through ``DocumentConverter`` using the pypdfium2
    backend with ``do_ocr`` switched on; the test passes when the Markdown
    export contains the string "1972".
    """
    # NOTE(review): judging by the fixture name, the sample's page rotation
    # presumably disagrees with its image placement -- confirm.
    ocr_options = PdfPipelineOptions()
    ocr_options.do_ocr = True

    pdf_option = PdfFormatOption(
        pipeline_options=ocr_options, backend=PyPdfiumDocumentBackend
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    sample_path = Path("./tests/data_scanned/sample_with_rotation_mismatch.pdf")
    result = converter.convert(sample_path)
    markdown = result.document.export_to_markdown()

    assert "1972" in markdown