From 3f0e98b1ad5a3bc4d4ed46681f13150dcc410667 Mon Sep 17 00:00:00 2001
From: Christoph Auer
Date: Thu, 6 Feb 2025 20:29:44 +0100
Subject: [PATCH] Add DoclingParseV3 backend implementation

Signed-off-by: Christoph Auer
---
 docling/backend/docling_parse_v3_backend.py | 243 ++++++++++++++++++++
 docs/examples/batch_convert.py              |  20 +-
 tests/test_backend_docling_parse_v3.py      |  82 ++++++++
 3 files changed, 338 insertions(+), 7 deletions(-)
 create mode 100644 docling/backend/docling_parse_v3_backend.py
 create mode 100644 tests/test_backend_docling_parse_v3.py

diff --git a/docling/backend/docling_parse_v3_backend.py b/docling/backend/docling_parse_v3_backend.py
new file mode 100644
index 00000000..9b6711c6
--- /dev/null
+++ b/docling/backend/docling_parse_v3_backend.py
@@ -0,0 +1,243 @@
+"""PDF backend combining docling-parse (``DoclingPdfParser``) and pypdfium2.
+
+Text cells and bitmap rectangles are read from the docling-parse page object;
+page images are rendered through pypdfium2.
+"""
+
+import logging
+import random
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+
+import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_parse.document import PageBoundaryType, ParsedPdfPage
+from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
+from docling_parse.pdf_parsers import pdf_parser_v2
+from PIL import Image, ImageDraw
+from pypdfium2 import PdfPage
+
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell, Size
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class DoclingParseV3PageBackend(PdfPageBackend):
+    """Page backend pairing a docling-parse page with a pypdfium2 page."""
+
+    def __init__(self, parsed_page: ParsedPdfPage, page_obj: PdfPage):
+        self._ppage = page_obj
+        self._dpage = parsed_page
+        self.valid = parsed_page is not None
+
+    def is_valid(self) -> bool:
+        """Return True when a parsed docling-parse page is available."""
+        return self.valid
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Return the text of all cells that lie mostly inside ``bbox``.
+
+        A cell is included when more than half of its own area overlaps
+        ``bbox``; the texts of included cells are joined with single spaces.
+        Cell boxes are converted to a top-left origin before the comparison.
+        """
+        text_piece = ""
+        page_size = self.get_size()
+
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+
+        for cell in self._dpage.sanitized.cells:
+            cell_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page_size.height)
+                .scaled(scale)
+            )
+
+            cell_area = cell_bbox.area()
+            if cell_area == 0:
+                # A zero-area cell cannot overlap anything; skipping it avoids
+                # a ZeroDivisionError in the ratio below.
+                continue
+
+            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area
+
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell.text
+
+        return text_piece
+
+    def get_text_cells(self) -> Iterable[Cell]:
+        """Return one ``Cell`` per docling-parse text cell, in top-left origin.
+
+        Cell ids are assigned sequentially starting at 0.
+        """
+        cells: List[Cell] = []
+        cell_counter = 0
+
+        page_size = self.get_size()
+
+        for cell in self._dpage.sanitized.cells:
+            cell_bbox = cell.rect.to_bounding_box()
+
+            # Swap edges that arrive in the wrong order before converting.
+            if cell_bbox.r < cell_bbox.l:
+                cell_bbox.r, cell_bbox.l = cell_bbox.l, cell_bbox.r
+            if cell_bbox.b > cell_bbox.t:
+                cell_bbox.b, cell_bbox.t = cell_bbox.t, cell_bbox.b
+
+            text_piece = cell.text
+            cells.append(
+                Cell(
+                    id=cell_counter,
+                    text=text_piece,
+                    bbox=cell_bbox.to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+
+        def draw_clusters_and_cells():
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                cell_color = (
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                )
+                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+            image.show()
+
+        # draw_clusters_and_cells()
+
+        return cells
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        """Yield the scaled, top-left-origin box of each bitmap on the page.
+
+        Bitmaps whose area does not exceed ``AREA_THRESHOLD`` are skipped.
+        """
+        AREA_THRESHOLD = 0  # 32 * 32
+
+        images = self._dpage.sanitized.bitmap_resources
+
+        for img in images:
+            cropbox = img.rect.to_bounding_box().to_top_left_origin(
+                self.get_size().height
+            )
+
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+
+                yield cropbox
+
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        """Render the page, or the ``cropbox`` region of it, as a PIL image.
+
+        The output measures ``cropbox.width * scale`` by
+        ``cropbox.height * scale`` pixels (rounded); without a ``cropbox`` the
+        full page is rendered.
+        """
+        page_size = self.get_size()
+
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+            padbox = BoundingBox(
+                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
+            )
+        else:
+            # The crop passed to render() is the distance trimmed from each
+            # page edge, so the right/top coordinates become margins here.
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
+            padbox.r = page_size.width - padbox.r
+            padbox.t = page_size.height - padbox.t
+
+        image = (
+            self._ppage.render(
+                scale=scale * 1.5,
+                rotation=0,  # no additional rotation
+                crop=padbox.as_tuple(),
+            )
+            .to_pil()
+            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
+        )  # We resize the image from 1.5x the given scale to make it sharper.
+
+        return image
+
+    def get_size(self) -> Size:
+        """Return the page dimensions reported by docling-parse."""
+        return Size(
+            width=self._dpage.sanitized.dimension.width,
+            height=self._dpage.sanitized.dimension.height,
+        )
+
+    def unload(self):
+        """Drop the references to both underlying page objects."""
+        self._ppage = None
+        self._dpage = None
+
+
+class DoclingParseV3DocumentBackend(PdfDocumentBackend):
+    """Document backend that opens a PDF with pypdfium2 and docling-parse."""
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+        self.parser = DoclingPdfParser(loglevel="fatal")
+        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=path_or_stream)
+        success = self.dp_doc is not None
+
+        if not success:
+            raise RuntimeError(
+                f"docling-parse v3 could not load document {self.document_hash}."
+            )
+
+    def page_count(self) -> int:
+        """Return the docling-parse page count, logging any pypdfium2 mismatch."""
+        # return len(self._pdoc)  # To be replaced with docling-parse API
+
+        len_1 = len(self._pdoc)
+        len_2 = self.dp_doc.number_of_pages()
+
+        if len_1 != len_2:
+            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
+
+        return len_2
+
+    def load_page(self, page_no: int) -> DoclingParseV3PageBackend:
+        """Return the page backend for the 0-based index ``page_no``."""
+        return DoclingParseV3PageBackend(
+            # get_page() is called with a 1-based page number here.
+            self.dp_doc.get_page(page_no + 1), self._pdoc[page_no]
+        )
+
+    def is_valid(self) -> bool:
+        """Return True when the document has at least one page."""
+        return self.page_count() > 0
+
+    def unload(self):
+        """Release the docling-parse document and close the pypdfium2 handle."""
+        super().unload()
+        self.dp_doc.unload()
+        self._pdoc.close()
+        self._pdoc = None
diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py
index f6ad92bd..b14d16f1 100644
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@@ -6,10 +6,11 @@ from typing import Iterable
 
 import yaml
 
-from docling.datamodel.base_models import ConversionStatus
+from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
+from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
-from docling.document_converter import DocumentConverter
+from docling.document_converter import DocumentConverter, PdfFormatOption
 
 _log = logging.getLogger(__name__)
 
@@ -103,10 +104,11 @@ def main():
     logging.basicConfig(level=logging.INFO)
 
     input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-        Path("./tests/data/2203.01017v2.pdf"),
-        Path("./tests/data/2305.03393v1.pdf"),
-        Path("./tests/data/redp5110_sampled.pdf"),
+        Path("tests/data/redp5110_sampled.pdf"),
+        # Path("./tests/data/2206.01062.pdf"),
+        # Path("./tests/data/2203.01017v2.pdf"),
+        # Path("./tests/data/2305.03393v1.pdf"),
+        # Path("./tests/data/redp5110_sampled.pdf"),
     ]
 
     # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
@@ -119,7 +121,11 @@ def main():
     # settings.debug.visualize_tables = True
     # settings.debug.visualize_cells = True
 
-    doc_converter = DocumentConverter()
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(backend=DoclingParseV3DocumentBackend)
+        }
+    )
 
     start_time = time.time()
 
diff --git a/tests/test_backend_docling_parse_v3.py b/tests/test_backend_docling_parse_v3.py
new file mode 100644
index 00000000..8230e223
--- /dev/null
+++ b/tests/test_backend_docling_parse_v3.py
@@ -0,0 +1,82 @@
+"""Tests for the docling-parse v3 PDF backend."""
+
+from pathlib import Path
+
+import pytest
+
+from docling.backend.docling_parse_v3_backend import (
+    DoclingParseV3DocumentBackend,
+    DoclingParseV3PageBackend,
+)
+from docling.datamodel.base_models import BoundingBox, InputFormat
+from docling.datamodel.document import InputDocument
+
+
+@pytest.fixture
+def test_doc_path():
+    return Path("./tests/data/2206.01062.pdf")
+
+
+def _get_backend(pdf_doc):
+    """Build an ``InputDocument`` for ``pdf_doc`` and return its backend."""
+    in_doc = InputDocument(
+        path_or_stream=pdf_doc,
+        format=InputFormat.PDF,
+        backend=DoclingParseV3DocumentBackend,
+    )
+
+    doc_backend = in_doc._backend
+    return doc_backend
+
+
+def test_text_cell_counts():
+    """Loading the same page repeatedly must give a stable cell count."""
+    pdf_doc = Path("./tests/data/redp5110_sampled.pdf")
+
+    doc_backend = _get_backend(pdf_doc)
+
+    for page_index in range(0, doc_backend.page_count()):
+        last_cell_count = None
+        for _ in range(10):
+            # Reload the page currently under test on every iteration.
+            page_backend: DoclingParseV3PageBackend = doc_backend.load_page(page_index)
+            cells = list(page_backend.get_text_cells())
+
+            if last_cell_count is None:
+                last_cell_count = len(cells)
+
+            assert (
+                len(cells) == last_cell_count
+            ), "Loading page multiple times yielded non-identical text cell counts"
+
+
+def test_get_text_from_rect(test_doc_path):
+    doc_backend = _get_backend(test_doc_path)
+    page_backend: DoclingParseV3PageBackend = doc_backend.load_page(0)
+
+    # Get the title text of the DocLayNet paper
+    textpiece = page_backend.get_text_in_rect(
+        bbox=BoundingBox(l=102, t=77, r=511, b=124)
+    )
+    ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
+
+    assert textpiece.strip() == ref
+
+
+def test_crop_page_image(test_doc_path):
+    doc_backend = _get_backend(test_doc_path)
+    page_backend: DoclingParseV3PageBackend = doc_backend.load_page(0)
+
+    # Crop out "Figure 1" from the DocLayNet paper
+    scale = 2
+    cropbox = BoundingBox(l=317, t=246, r=574, b=527)
+    im = page_backend.get_page_image(scale=scale, cropbox=cropbox)
+    # im.show()
+
+    # The backend resizes the render to the scaled crop dimensions.
+    assert im.size == (round(cropbox.width * scale), round(cropbox.height * scale))
+
+
+def test_num_pages(test_doc_path):
+    doc_backend = _get_backend(test_doc_path)
+    assert doc_backend.page_count() == 9