# Patch header (preserved): diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py new file mode 100644 index 00000000..690d139d --- /dev/null +++ b/docling/backend/docling_parse_backend.py @@ -0,0 +1,234 @@
"""PDF backend that combines pypdfium2 rendering with the docling-parse v1 parser."""

import logging
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)


class DoclingParsePageBackend(PdfPageBackend):
    """Single-page backend: text comes from docling-parse, pixels from pypdfium2."""

    def __init__(
        self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse page ``page_no`` of the document registered under ``document_hash``.

        The page is marked invalid (see ``is_valid``) when the parser result has
        no ``"pages"`` entry; no exception is raised in that case.
        """
        self._ppage = page_obj
        # Define the attribute up front so an invalid page never raises
        # AttributeError when another method looks at it.
        self._dpage = None
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        self.valid = "pages" in parsed_page
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return True when docling-parse produced data for this page."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of all cells lying mostly inside ``bbox``.

        A cell is included when more than half of its own area overlaps
        ``bbox``. Returns ``""`` for an invalid page.
        """
        if not self.valid:
            return ""

        text_piece = ""
        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for cell in self._dpage["cells"]:
            x0, y0, x1, y1 = cell["box"]["device"]
            # Rescale from the parser's page dimensions to the pypdfium2 page
            # size, then flip to the top-left origin.
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            cell_area = cell_bbox.area()
            if cell_area == 0:
                # Degenerate cell: the overlap fraction is undefined, and
                # dividing would raise ZeroDivisionError.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return None: this backend does not produce a segmented page."""
        return None

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the page's text cells in top-left-origin page coordinates.

        Cell boxes are normalised so that ``x0 <= x1`` and ``y0 <= y1`` and then
        rescaled from the parser's page dimensions to the pypdfium2 page size.
        Returns an empty list for an invalid page.
        """
        cells: List[TextCell] = []

        if not self.valid:
            return cells

        page_size = self.get_size()

        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        for cell_index, cell in enumerate(self._dpage["cells"]):
            x0, y0, x1, y1 = cell["box"]["device"]

            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell["content"]["rnormalized"]

            # Convert the rectangle itself rather than calling
            # to_top_left_origin() on the TextCell and appending that call's
            # return value, so the list always receives the cell object.
            # NOTE(review): assumes BoundingRectangle.to_top_left_origin returns
            # the converted rectangle - confirm against docling-core.
            rect = BoundingRectangle.from_bounding_box(
                BoundingBox(
                    l=x0 * page_size.width / parser_width,
                    b=y0 * page_size.height / parser_height,
                    r=x1 * page_size.width / parser_width,
                    t=y1 * page_size.height / parser_height,
                    coord_origin=CoordOrigin.BOTTOMLEFT,
                )
            ).to_top_left_origin(page_size.height)

            cells.append(
                TextCell(
                    index=cell_index,
                    text=text_piece,
                    orig=text_piece,
                    rect=rect,
                )
            )

        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the scaled, top-left-origin boxes of bitmap images on the page.

        Only boxes whose area exceeds ``AREA_THRESHOLD`` are yielded. Nothing is
        yielded for an invalid page.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        if not self.valid:
            # No parser data for this page, so there is nothing to enumerate.
            return

        page_height = self.get_size().height
        for bitmap in self._dpage["images"]:
            cropbox = BoundingBox.from_tuple(
                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        The result measures ``round(cropbox.width * scale)`` by
        ``round(cropbox.height * scale)`` pixels. Without a ``cropbox`` the full
        page is rendered.
        """
        page_size = self.get_size()

        if cropbox is None:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop rectangle into distances from the page edges, which
            # is the form handed to render(crop=...).
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page size as reported by pypdfium2."""
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the pypdfium2 page and the parsed page data."""
        self._ppage = None
        self._dpage = None


class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that loads a PDF into both pypdfium2 and docling-parse v1."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document with pypdfium2 and register it with docling-parse.

        Raises:
            RuntimeError: if docling-parse could not load the document.
        """
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v1()

        success = False
        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, self.path_or_stream
            )
        elif isinstance(self.path_or_stream, Path):
            success = self.parser.load_document(
                self.document_hash, str(self.path_or_stream)
            )

        if not success:
            # Release the pdfium handle before bailing out so a failed load
            # does not leak the open document.
            self._pdoc.close()
            self._pdoc = None
            raise RuntimeError(
                f"docling-parse could not load document with hash {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the number of pages according to pypdfium2."""
        return len(self._pdoc)  # To be replaced with docling-parse API

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Create the page backend for page index ``page_no``."""
        return DoclingParsePageBackend(
            self.parser, self.document_hash, page_no, self._pdoc[page_no]
        )

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Unload the document from docling-parse and close the pypdfium2 handle."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        # Guarded so that unloading after a failed load (handle already closed)
        # or unloading twice does not raise.
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None


# Patch text for the next file (preserved verbatim; continues on the following line): diff --git a/docling/backend/docling_parse_v3_backend.py b/docling/backend/docling_parse_v3_backend.py index ffe64911..351dd31d 100644 --- a/docling/backend/docling_parse_v3_backend.py +++ b/docling/backend/docling_parse_v3_backend.py @@ -65,9 +65,9 @@ class DoclingParseV3PageBackend(PdfPageBackend): for cell in self._dpage.textline_cells: rect = cell.rect - if rect.r_x2 < rect.r_x0: - rect.r_x0, rect.r_x2 = rect.r_x2, rect.r_x0 - rect.r_y3, rect.r_y1 = rect.r_y1, rect.r_y3 + # if rect.r_x2 < rect.r_x0: + # rect.r_x0, rect.r_x2 = rect.r_x2, rect.r_x0 + # rect.r_y3, rect.r_y1 = rect.r_y1, rect.r_y3 # rect.r_x2, rect.r_x3 = rect.r_x3,
rect.r_x2 diff --git a/docling/cli/main.py b/docling/cli/main.py index 831c1c35..1caad6c6 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -412,7 +412,9 @@ def convert( if artifacts_path is not None: pipeline_options.artifacts_path = artifacts_path - if pdf_backend == PdfBackend.DLPARSE_V2: + if pdf_backend == PdfBackend.DLPARSE_V1: + backend = DoclingParseV2DocumentBackend + elif pdf_backend == PdfBackend.DLPARSE_V2: backend = DoclingParseV2DocumentBackend elif pdf_backend == PdfBackend.DLPARSE_V3: backend = DoclingParseV3DocumentBackend # type: ignore diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 1162aaca..b267d983 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -299,6 +299,7 @@ class PdfBackend(str, Enum): """Enum of valid PDF backends.""" PYPDFIUM2 = "pypdfium2" + DLPARSE_V1 = "dlparse_v1" DLPARSE_V2 = "dlparse_v2" DLPARSE_V3 = "dlparse_v3" @@ -381,3 +382,5 @@ class PdfPipelineOptions(PaginatedPipelineOptions): "before conversion and then use the `TableItem.get_image` function." 
), ) + + generate_parsed_pages: bool = False diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 2ac52be1..9dc291a9 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder class PagePreprocessingOptions(BaseModel): images_scale: Optional[float] + create_parsed_page: bool class PagePreprocessingModel(BasePageModel): @@ -54,7 +55,9 @@ class PagePreprocessingModel(BasePageModel): assert page._backend is not None page.cells = list(page._backend.get_text_cells()) - page.parsed_page = page._backend.get_segmented_page() + + if self.options.create_parsed_page: + page.parsed_page = page._backend.get_segmented_page() # DEBUG code: def draw_text_boxes(image, cells, show: bool = False): diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 7df8f15b..a56b84b5 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -87,7 +87,8 @@ class StandardPdfPipeline(PaginatedPipeline): # Pre-processing PagePreprocessingModel( options=PagePreprocessingOptions( - images_scale=pipeline_options.images_scale + images_scale=pipeline_options.images_scale, + create_parsed_page=pipeline_options.generate_parsed_pages, ) ), # OCR diff --git a/poetry.lock b/poetry.lock index 1fd68f66..329f46da 100644 --- a/poetry.lock +++ b/poetry.lock @@ -898,7 +898,7 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] type = "git" url = "https://github.com/DS4SD/docling-core" reference = "cau/docling-parse-types" -resolved_reference = "a2f1fccf80324e74c1ed66574bfa2bc02163e2ae" +resolved_reference = "7cb80880a4781e781cf797d42bda34498cf81184" [[package]] name = "docling-ibm-models" diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py new file mode 100644 index 00000000..3c214791 --- /dev/null 
# Patch header (preserved): +++ b/tests/test_backend_docling_parse.py @@ -0,0 +1,77 @@
"""Tests for the docling-parse (v1) PDF backend."""

from pathlib import Path

import pytest
from docling_core.types.doc import BoundingBox

from docling.backend.docling_parse_backend import (
    DoclingParseDocumentBackend,
    DoclingParsePageBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument


@pytest.fixture
def test_doc_path():
    """Path to the DocLayNet paper used by most tests in this module."""
    return Path("./tests/data/pdf/2206.01062.pdf")


def _get_backend(pdf_doc):
    """Build an InputDocument for ``pdf_doc`` and return its document backend."""
    in_doc = InputDocument(
        path_or_stream=pdf_doc,
        format=InputFormat.PDF,
        backend=DoclingParseDocumentBackend,
    )

    doc_backend = in_doc._backend
    return doc_backend


def test_text_cell_counts():
    """Loading the same page repeatedly must yield the same number of cells."""
    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

    doc_backend = _get_backend(pdf_doc)

    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for _ in range(10):
            # Load the page under test; always loading page 0 would make the
            # outer loop check the first page over and over.
            page_backend: DoclingParsePageBackend = doc_backend.load_page(page_index)
            cells = list(page_backend.get_text_cells())

            if last_cell_count is None:
                last_cell_count = len(cells)

            assert (
                len(cells) == last_cell_count
            ), "Loading page multiple times yielded non-identical text cell counts"


def test_get_text_from_rect(test_doc_path):
    """The title region of the first page yields the paper title."""
    doc_backend = _get_backend(test_doc_path)
    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

    # Get the title text of the DocLayNet paper
    textpiece = page_backend.get_text_in_rect(
        bbox=BoundingBox(l=102, t=77, r=511, b=124)
    )
    ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"

    assert textpiece.strip() == ref


def test_crop_page_image(test_doc_path):
    """Cropping a region at scale 2 returns an image of the scaled crop size."""
    doc_backend = _get_backend(test_doc_path)
    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # get_page_image resizes to (round(width * scale), round(height * scale)):
    # (574 - 317) * 2 by (527 - 246) * 2.
    assert im.size == (514, 562)
    # im.show()


def test_num_pages(test_doc_path):
    """The DocLayNet paper fixture is expected to have nine pages."""
    doc_backend = _get_backend(test_doc_path)
    # The comparison must be asserted; as a bare expression its result is
    # discarded and the test can never fail.
    assert doc_backend.page_count() == 9