From f4117725698f952ccb8e4b29ca17a676958165c7 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 11 Mar 2025 16:06:28 +0100 Subject: [PATCH] Fixes and test updates Signed-off-by: Christoph Auer --- docling/backend/docling_parse_backend.py | 227 -------------------- docling/backend/docling_parse_v2_backend.py | 31 +-- docling/backend/docling_parse_v3_backend.py | 47 +--- docling/backend/pdf_backend.py | 9 +- docling/backend/pypdfium2_backend.py | 80 ++++--- docling/cli/main.py | 10 +- docling/datamodel/base_models.py | 18 +- docling/datamodel/pipeline_options.py | 2 +- docling/models/base_ocr_model.py | 23 +- docling/models/easyocr_model.py | 26 ++- docling/models/ocr_mac_model.py | 16 +- docling/models/page_preprocessing_model.py | 1 + docling/models/rapid_ocr_model.py | 30 ++- docling/models/table_structure_model.py | 17 +- docling/models/tesseract_ocr_cli_model.py | 26 ++- docling/models/tesseract_ocr_model.py | 16 +- docling/utils/export.py | 14 +- docling/utils/layout_postprocessor.py | 47 ++-- docling/utils/visualization.py | 2 +- poetry.lock | 2 +- tests/test_backend_docling_parse.py | 77 ------- tests/test_backend_docling_parse_v3.py | 2 +- tests/test_code_formula.py | 1 - tests/test_e2e_conversion.py | 4 +- tests/test_e2e_ocr_conversion.py | 4 +- tests/test_interfaces.py | 4 +- tests/test_options.py | 4 +- tests/verify_utils.py | 4 +- 28 files changed, 239 insertions(+), 505 deletions(-) delete mode 100644 docling/backend/docling_parse_backend.py delete mode 100644 tests/test_backend_docling_parse.py diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py deleted file mode 100644 index 6d22127b..00000000 --- a/docling/backend/docling_parse_backend.py +++ /dev/null @@ -1,227 +0,0 @@ -import logging -import random -from io import BytesIO -from pathlib import Path -from typing import Iterable, List, Optional, Union - -import pypdfium2 as pdfium -from docling_core.types.doc import BoundingBox, CoordOrigin, Size -from docling_parse.pdf_parsers import pdf_parser_v1 -from PIL import Image, ImageDraw -from pypdfium2 import PdfPage - -from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell -from docling.datamodel.document import InputDocument - -_log = logging.getLogger(__name__) - - -class DoclingParsePageBackend(PdfPageBackend): - def __init__( - self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage - ): - self._ppage = page_obj - parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) - - self.valid = "pages" in parsed_page - if self.valid: - self._dpage = parsed_page["pages"][0] - else: - _log.info( - f"An error occurred when loading page {page_no} of document {document_hash}." - ) - - def is_valid(self) -> bool: - return self.valid - - def get_text_in_rect(self, bbox: BoundingBox) -> str: - if not self.valid: - return "" - # Find intersecting cells on the page - text_piece = "" - page_size = self.get_size() - parser_width = self._dpage["width"] - parser_height = self._dpage["height"] - - scale = ( - 1 # FIX - Replace with param in get_text_in_rect across backends (optional) - ) - - for i in range(len(self._dpage["cells"])): - rect = self._dpage["cells"][i]["box"]["device"] - x0, y0, x1, y1 = rect - cell_bbox = BoundingBox( - l=x0 * scale * page_size.width / parser_width, - b=y0 * scale * page_size.height / parser_height, - r=x1 * scale * page_size.width / parser_width, - t=y1 * scale * page_size.height / parser_height, - coord_origin=CoordOrigin.BOTTOMLEFT, - ).to_top_left_origin(page_height=page_size.height * scale) - - overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area() - - if overlap_frac > 0.5: - if len(text_piece) > 0: - text_piece += " " - text_piece += self._dpage["cells"][i]["content"]["rnormalized"] - - return text_piece - - def get_text_cells(self) -> Iterable[Cell]: - cells: List[Cell] = [] - cell_counter = 0 - - if not self.valid: - return cells - - page_size = self.get_size() - - parser_width = self._dpage["width"] - parser_height = self._dpage["height"] - - for i in range(len(self._dpage["cells"])): - rect = self._dpage["cells"][i]["box"]["device"] - x0, y0, x1, y1 = rect - - if x1 < x0: - x0, x1 = x1, x0 - if y1 < y0: - y0, y1 = y1, y0 - - text_piece = self._dpage["cells"][i]["content"]["rnormalized"] - cells.append( - Cell( - id=cell_counter, - text=text_piece, - bbox=BoundingBox( - # l=x0, b=y0, r=x1, t=y1, - l=x0 * page_size.width / parser_width, - b=y0 * page_size.height / parser_height, - r=x1 * page_size.width / parser_width, - t=y1 * page_size.height / parser_height, - coord_origin=CoordOrigin.BOTTOMLEFT, - ).to_top_left_origin(page_size.height), - ) - ) - cell_counter += 1 - - def draw_clusters_and_cells(): - image = ( - self.get_page_image() - ) # make new image to avoid drawing on the saved ones - draw = ImageDraw.Draw(image) - for c in cells: - x0, y0, x1, y1 = c.bbox.as_tuple() - cell_color = ( - random.randint(30, 140), - random.randint(30, 140), - random.randint(30, 140), - ) - draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) - image.show() - - # before merge: - # draw_clusters_and_cells() - - # cells = merge_horizontal_cells(cells) - - # after merge: - # draw_clusters_and_cells() - - return cells - - def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: - AREA_THRESHOLD = 0 # 32 * 32 - - for i in range(len(self._dpage["images"])): - bitmap = self._dpage["images"][i] - cropbox = BoundingBox.from_tuple( - bitmap["box"], origin=CoordOrigin.BOTTOMLEFT - ).to_top_left_origin(self.get_size().height) - - if cropbox.area() > AREA_THRESHOLD: - cropbox = cropbox.scaled(scale=scale) - - yield cropbox - - def get_page_image( - self, scale: float = 1, cropbox: Optional[BoundingBox] = None - ) -> Image.Image: - - page_size = self.get_size() - - if not cropbox: - cropbox = BoundingBox( - l=0, - r=page_size.width, - t=0, - b=page_size.height, - coord_origin=CoordOrigin.TOPLEFT, - ) - padbox = BoundingBox( - l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT - ) - else: - padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy() - padbox.r = page_size.width - padbox.r - padbox.t = page_size.height - padbox.t - - image = ( - self._ppage.render( - scale=scale * 1.5, - rotation=0, # no additional rotation - crop=padbox.as_tuple(), - ) - .to_pil() - .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale))) - ) # We resize the image from 1.5x the given scale to make it sharper. - - return image - - def get_size(self) -> Size: - return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) - - def unload(self): - self._ppage = None - self._dpage = None - - -class DoclingParseDocumentBackend(PdfDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): - super().__init__(in_doc, path_or_stream) - - self._pdoc = pdfium.PdfDocument(self.path_or_stream) - self.parser = pdf_parser_v1() - - success = False - if isinstance(self.path_or_stream, BytesIO): - success = self.parser.load_document_from_bytesio( - self.document_hash, self.path_or_stream - ) - elif isinstance(self.path_or_stream, Path): - success = self.parser.load_document( - self.document_hash, str(self.path_or_stream) - ) - - if not success: - raise RuntimeError( - f"docling-parse could not load document with hash {self.document_hash}." - ) - - def page_count(self) -> int: - return len(self._pdoc) # To be replaced with docling-parse API - - def load_page(self, page_no: int) -> DoclingParsePageBackend: - return DoclingParsePageBackend( - self.parser, self.document_hash, page_no, self._pdoc[page_no] - ) - - def is_valid(self) -> bool: - return self.page_count() > 0 - - def unload(self): - super().unload() - self.parser.unload_document(self.document_hash) - self._pdoc.close() - self._pdoc = None diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 9178883f..96525ba3 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_parse.pdf_parsers import pdf_parser_v2 from PIL import Image, ImageDraw from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell, Size +from docling.datamodel.base_models import Size from docling.utils.locks import pypdfium2_lock if TYPE_CHECKING: @@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend): return text_piece - def get_text_cells(self) -> Iterable[Cell]: - cells: List[Cell] = [] + def get_segmented_page(self) -> Optional[SegmentedPdfPage]: + return None + + def get_text_cells(self) -> Iterable[TextCell]: + cells: List[TextCell] = [] cell_counter = 0 if not self.valid: @@ -106,16 +110,19 @@ class DoclingParseV2PageBackend(PdfPageBackend): text_piece = cell_data[cells_header.index("text")] cells.append( - Cell( - id=cell_counter, + TextCell( + index=cell_counter, text=text_piece, - bbox=BoundingBox( - # l=x0, b=y0, r=x1, t=y1, - l=x0 * page_size.width / parser_width, - b=y0 * page_size.height / parser_height, - r=x1 * page_size.width / parser_width, - t=y1 * page_size.height / parser_height, - coord_origin=CoordOrigin.BOTTOMLEFT, + orig=text_piece, + rect=BoundingRectangle.from_bounding_box( + BoundingBox( + # l=x0, b=y0, r=x1, t=y1, + l=x0 * page_size.width / parser_width, + b=y0 * page_size.height / parser_height, + r=x1 * page_size.width / parser_width, + t=y1 * page_size.height / parser_height, + coord_origin=CoordOrigin.BOTTOMLEFT, + ) ).to_top_left_origin(page_size.height), ) ) diff --git a/docling/backend/docling_parse_v3_backend.py b/docling/backend/docling_parse_v3_backend.py index 7ff451ee..0ebeafd3 100644 --- a/docling/backend/docling_parse_v3_backend.py +++ b/docling/backend/docling_parse_v3_backend.py @@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_core.types.doc.page import SegmentedPdfPage +from docling_core.types.doc.page import SegmentedPdfPage, TextCell from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument from PIL import Image, ImageDraw from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell, Size +from docling.datamodel.base_models import Size if TYPE_CHECKING: from docling.datamodel.document import InputDocument @@ -54,48 +54,15 @@ class DoclingParseV3PageBackend(PdfPageBackend): return text_piece - def get_text_cells(self) -> Iterable[Cell]: - cells: List[Cell] = [] - cell_counter = 0 + def get_segmented_page(self) -> Optional[SegmentedPdfPage]: + return self._dpage + def get_text_cells(self) -> Iterable[TextCell]: page_size = self.get_size() - for i, cell in enumerate(self._dpage.textline_cells): - cell_bbox = cell.rect.to_bounding_box() + [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells] - if cell_bbox.r < cell_bbox.l: - cell_bbox.r, cell_bbox.l = cell_bbox.l, cell_bbox.r - if cell_bbox.b > cell_bbox.t: - cell_bbox.b, cell_bbox.t = cell_bbox.t, cell_bbox.b - - text_piece = cell.text - cells.append( - Cell( - id=cell_counter, - text=text_piece, - bbox=cell_bbox.to_top_left_origin(page_size.height), - ) - ) - cell_counter += 1 - - def draw_clusters_and_cells(): - image = ( - self.get_page_image() - ) # make new image to avoid drawing on the saved ones - draw = ImageDraw.Draw(image) - for c in cells: - x0, y0, x1, y1 = c.bbox.as_tuple() - cell_color = ( - random.randint(30, 140), - random.randint(30, 140), - random.randint(30, 140), - ) - draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) - image.show() - - # draw_clusters_and_cells() - - return cells + return self._dpage.textline_cells def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: AREA_THRESHOLD = 0 # 32 * 32 diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index 35c83b8c..cfecc7e6 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -4,10 +4,11 @@ from pathlib import Path from typing import Iterable, Optional, Set, Union from docling_core.types.doc import BoundingBox, Size +from docling_core.types.doc.page import SegmentedPdfPage, TextCell from PIL import Image from docling.backend.abstract_backend import PaginatedDocumentBackend -from docling.datamodel.base_models import Cell, InputFormat +from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument @@ -17,7 +18,11 @@ class PdfPageBackend(ABC): pass @abstractmethod - def get_text_cells(self) -> Iterable[Cell]: + def get_segmented_page(self) -> Optional[SegmentedPdfPage]: + pass + + @abstractmethod + def get_text_cells(self) -> Iterable[TextCell]: pass @abstractmethod diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index b585e2d5..5a5903de 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c from docling_core.types.doc import BoundingBox, CoordOrigin, Size +from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from PIL import Image, ImageDraw from pypdfium2 import PdfTextPage from pypdfium2._helpers.misc import PdfiumError from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell from docling.utils.locks import pypdfium2_lock if TYPE_CHECKING: @@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend): return text_piece - def get_text_cells(self) -> Iterable[Cell]: + def get_segmented_page(self) -> Optional[SegmentedPdfPage]: + return None + + def get_text_cells(self) -> Iterable[TextCell]: with pypdfium2_lock: if not self.text_page: self.text_page = self._ppage.get_textpage() @@ -84,11 +87,18 @@ class PyPdfiumPageBackend(PdfPageBackend): text_piece = self.text_page.get_text_bounded(*rect) x0, y0, x1, y1 = rect cells.append( - Cell( - id=cell_counter, + TextCell( + index=cell_counter, text=text_piece, - bbox=BoundingBox( - l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT + orig=text_piece, + rect=BoundingRectangle.from_bounding_box( + BoundingBox( + l=x0, + b=y0, + r=x1, + t=y1, + coord_origin=CoordOrigin.BOTTOMLEFT, + ) ).to_top_left_origin(page_size.height), ) ) @@ -97,51 +107,56 @@ class PyPdfiumPageBackend(PdfPageBackend): # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs. # The cell merging code below is to clean this up. def merge_horizontal_cells( - cells: List[Cell], + cells: List[TextCell], horizontal_threshold_factor: float = 1.0, vertical_threshold_factor: float = 0.5, - ) -> List[Cell]: + ) -> List[TextCell]: if not cells: return [] - def group_rows(cells: List[Cell]) -> List[List[Cell]]: + def group_rows(cells: List[TextCell]) -> List[List[TextCell]]: rows = [] current_row = [cells[0]] - row_top = cells[0].bbox.t - row_bottom = cells[0].bbox.b - row_height = cells[0].bbox.height + row_top = cells[0].rect.to_bounding_box().t + row_bottom = cells[0].rect.to_bounding_box().b + row_height = cells[0].rect.to_bounding_box().height for cell in cells[1:]: vertical_threshold = row_height * vertical_threshold_factor if ( - abs(cell.bbox.t - row_top) <= vertical_threshold - and abs(cell.bbox.b - row_bottom) <= vertical_threshold + abs(cell.rect.to_bounding_box().t - row_top) + <= vertical_threshold + and abs(cell.rect.to_bounding_box().b - row_bottom) + <= vertical_threshold ): current_row.append(cell) - row_top = min(row_top, cell.bbox.t) - row_bottom = max(row_bottom, cell.bbox.b) + row_top = min(row_top, cell.rect.to_bounding_box().t) + row_bottom = max(row_bottom, cell.rect.to_bounding_box().b) row_height = row_bottom - row_top else: rows.append(current_row) current_row = [cell] - row_top = cell.bbox.t - row_bottom = cell.bbox.b - row_height = cell.bbox.height + row_top = cell.rect.to_bounding_box().t + row_bottom = cell.rect.to_bounding_box().b + row_height = cell.rect.to_bounding_box().height if current_row: rows.append(current_row) return rows - def merge_row(row: List[Cell]) -> List[Cell]: + def merge_row(row: List[TextCell]) -> List[TextCell]: merged = [] current_group = [row[0]] for cell in row[1:]: prev_cell = current_group[-1] - avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2 + avg_height = ( + prev_cell.rect.height + cell.rect.to_bounding_box().height + ) / 2 if ( - cell.bbox.l - prev_cell.bbox.r + cell.rect.to_bounding_box().l + - prev_cell.rect.to_bounding_box().r <= avg_height * horizontal_threshold_factor ): current_group.append(cell) @@ -154,24 +169,29 @@ class PyPdfiumPageBackend(PdfPageBackend): return merged - def merge_group(group: List[Cell]) -> Cell: + def merge_group(group: List[TextCell]) -> TextCell: if len(group) == 1: return group[0] merged_text = "".join(cell.text for cell in group) merged_bbox = BoundingBox( - l=min(cell.bbox.l for cell in group), - t=min(cell.bbox.t for cell in group), - r=max(cell.bbox.r for cell in group), - b=max(cell.bbox.b for cell in group), + l=min(cell.rect.to_bounding_box().l for cell in group), + t=min(cell.rect.to_bounding_box().t for cell in group), + r=max(cell.rect.to_bounding_box().r for cell in group), + b=max(cell.rect.to_bounding_box().b for cell in group), + ) + return TextCell( + index=group[0].index, + text=merged_text, + orig=merged_text, + rect=BoundingRectangle.from_bounding_box(merged_bbox), ) - return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox) rows = group_rows(cells) merged_cells = [cell for row in rows for cell in merge_row(row)] for i, cell in enumerate(merged_cells, 1): - cell.id = i + cell.index = i return merged_cells @@ -181,7 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend): ) # make new image to avoid drawing on the saved ones draw = ImageDraw.Draw(image) for c in cells: - x0, y0, x1, y1 = c.bbox.as_tuple() + x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple() cell_color = ( random.randint(30, 140), random.randint(30, 140), diff --git a/docling/cli/main.py b/docling/cli/main.py index a2c28fd7..fc3e1a9b 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -14,8 +14,8 @@ from docling_core.types.doc import ImageRefMode from docling_core.utils.file import resolve_source_to_path from pydantic import TypeAdapter -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ( @@ -412,12 +412,12 @@ def convert( if artifacts_path is not None: pipeline_options.artifacts_path = artifacts_path - if pdf_backend == PdfBackend.DLPARSE_V1: - backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend - elif pdf_backend == PdfBackend.DLPARSE_V2: + if pdf_backend == PdfBackend.DLPARSE_V2: backend = DoclingParseV2DocumentBackend + elif pdf_backend == PdfBackend.DLPARSE_V3: + backend = DoclingParseV3DocumentBackend # type: ignore elif pdf_backend == PdfBackend.PYPDFIUM2: - backend = PyPdfiumDocumentBackend + backend = PyPdfiumDocumentBackend # type: ignore else: raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 3297c9a5..76827a1b 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -9,6 +9,7 @@ from docling_core.types.doc import ( Size, TableCell, ) +from docling_core.types.doc.page import SegmentedPdfPage, TextCell from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location DocumentStream, ) @@ -123,14 +124,10 @@ class ErrorItem(BaseModel): error_message: str -class Cell(BaseModel): - id: int - text: str - bbox: BoundingBox - - -class OcrCell(Cell): - confidence: float +# class Cell(BaseModel): +# id: int +# text: str +# bbox: BoundingBox class Cluster(BaseModel): @@ -138,7 +135,7 @@ class Cluster(BaseModel): label: DocItemLabel bbox: BoundingBox confidence: float = 1.0 - cells: List[Cell] = [] + cells: List[TextCell] = [] children: List["Cluster"] = [] # Add child cluster support @@ -226,7 +223,8 @@ class Page(BaseModel): page_no: int # page_hash: Optional[str] = None size: Optional[Size] = None - cells: List[Cell] = [] + cells: List[TextCell] = [] + parsed_page: Optional[SegmentedPdfPage] = None predictions: PagePredictions = PagePredictions() assembled: Optional[AssembledUnit] = None diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 3a55ecfc..43fa6c7e 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -299,8 +299,8 @@ class PdfBackend(str, Enum): """Enum of valid PDF backends.""" PYPDFIUM2 = "pypdfium2" - DLPARSE_V1 = "dlparse_v1" DLPARSE_V2 = "dlparse_v2" + DLPARSE_V3 = "dlparse_v3" # Define an enum for the ocr engines diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 9afb7dde..4b153ff6 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -6,11 +6,12 @@ from typing import Iterable, List import numpy as np from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell from PIL import Image, ImageDraw from rtree import index from scipy.ndimage import binary_dilation, find_objects, label -from docling.datamodel.base_models import Cell, OcrCell, Page +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import OcrOptions from docling.datamodel.settings import settings @@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel): p.dimension = 2 idx = index.Index(properties=p) for i, cell in enumerate(programmatic_cells): - idx.insert(i, cell.bbox.as_tuple()) + idx.insert(i, cell.rect.to_bounding_box().as_tuple()) def is_overlapping_with_existing_cells(ocr_cell): # Query the R-tree to get overlapping rectangles - possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple())) + possible_matches_index = list( + idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple()) + ) return ( len(possible_matches_index) > 0 @@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel): """ if self.options.force_full_page_ocr: # If a full page OCR is forced, use only the OCR cells - cells = [ - Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox) - for c_ocr in ocr_cells - ] + cells = ocr_cells return cells ## Remove OCR cells which overlap with programmatic cells. @@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel): # Draw OCR and programmatic cells for tc in page.cells: - x0, y0, x1, y1 = tc.bbox.as_tuple() + x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple() y0 *= scale_x y1 *= scale_y x0 *= scale_x @@ -165,9 +165,10 @@ class BaseOcrModel(BasePageModel): if y1 <= y0: y1, y0 = y0, y1 - color = "gray" - if isinstance(tc, OcrCell): - color = "magenta" + color = "magenta" + if isinstance(tc, PdfTextCell): + color = "gray" + draw.rectangle([(x0, y0), (x1, y1)], outline=color) if show: diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 0eccb988..59b9f2ba 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -6,8 +6,9 @@ from typing import Iterable, List, Optional import numpy from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import BoundingRectangle, TextCell -from docling.datamodel.base_models import Cell, OcrCell, Page +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( AcceleratorDevice, @@ -148,18 +149,21 @@ class EasyOcrModel(BaseOcrModel): del im cells = [ - OcrCell( - id=ix, + TextCell( + index=ix, text=line[1], + orig=line[1], confidence=line[2], - bbox=BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, + rect=BoundingRectangle.from_bounding_box( + BoundingBox.from_tuple( + coord=( + (line[0][0][0] / self.scale) + ocr_rect.l, + (line[0][0][1] / self.scale) + ocr_rect.t, + (line[0][2][0] / self.scale) + ocr_rect.l, + (line[0][2][1] / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ) ), ) for ix, line in enumerate(result) diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py index 38bcf1ca..74b25a73 100644 --- a/docling/models/ocr_mac_model.py +++ b/docling/models/ocr_mac_model.py @@ -3,8 +3,9 @@ import tempfile from typing import Iterable, Optional, Tuple from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import BoundingRectangle, TextCell -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import OcrMacOptions from docling.datamodel.settings import settings @@ -94,13 +95,16 @@ class OcrMacModel(BaseOcrModel): bottom = y2 / self.scale cells.append( - OcrCell( - id=ix, + TextCell( + index=ix, text=text, + orig=text, confidence=confidence, - bbox=BoundingBox.from_tuple( - coord=(left, top, right, bottom), - origin=CoordOrigin.TOPLEFT, + rect=BoundingRectangle.from_bounding_box( + BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, + ) ), ) ) diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 63f1a4f6..2ac52be1 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -54,6 +54,7 @@ class PagePreprocessingModel(BasePageModel): assert page._backend is not None page.cells = list(page._backend.get_text_cells()) + page.parsed_page = page._backend.get_segmented_page() # DEBUG code: def draw_text_boxes(image, cells, show: bool = False): diff --git a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py index fa3fbedf..f13fd6cc 100644 --- a/docling/models/rapid_ocr_model.py +++ b/docling/models/rapid_ocr_model.py @@ -3,8 +3,9 @@ from typing import Iterable import numpy from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import BoundingRectangle, TextCell -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( AcceleratorDevice, @@ -100,18 +101,25 @@ class RapidOcrModel(BaseOcrModel): if result is not None: cells = [ - OcrCell( - id=ix, + TextCell( + index=ix, text=line[1], + orig=line[1], confidence=line[2], - bbox=BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, + rect=BoundingRectangle.from_bounding_box( + BoundingBox.from_tuple( + coord=( + (line[0][0][0] / self.scale) + + ocr_rect.l, + (line[0][0][1] / self.scale) + + ocr_rect.t, + (line[0][2][0] / self.scale) + + ocr_rect.l, + (line[0][2][1] / self.scale) + + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ) ), ) for ix, line in enumerate(result) diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 64979157..c0225a0d 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union import numpy from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell +from docling_core.types.doc.page import BoundingRectangle from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from PIL import ImageDraw @@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel): draw.rectangle([(x0, y0), (x1, y1)], outline="red") for cell in table_element.cluster.cells: - x0, y0, x1, y1 = cell.bbox.as_tuple() + x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple() x0 *= scale_x x1 *= scale_x y0 *= scale_x @@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel): # Only allow non empty stings (spaces) into the cells of a table if len(c.text.strip()) > 0: new_cell = copy.deepcopy(c) - new_cell.bbox = new_cell.bbox.scaled( - scale=self.scale + new_cell.rect = BoundingRectangle.from_bounding_box( + new_cell.rect.to_bounding_box().scaled( + scale=self.scale + ) ) - tokens.append(new_cell.model_dump()) + tokens.append( + { + "id": new_cell.index, + "text": new_cell.text, + "bbox": new_cell.rect.to_bounding_box().model_dump(), + } + ) page_input["tokens"] = tokens tf_output = self.tf_predictor.multi_table_predict( diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index ac8dd51f..a4744c52 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -8,8 +8,9 @@ from typing import Iterable, List, Optional, Tuple import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import BoundingRectangle, TextCell -from docling.datamodel.base_models import Cell, OcrCell, Page +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.settings import settings @@ -228,18 +229,21 @@ class TesseractOcrCliModel(BaseOcrModel): t = b + h r = l + w - cell = OcrCell( - id=ix, + cell = TextCell( + index=ix, text=text, + orig=text, confidence=conf / 100.0, - bbox=BoundingBox.from_tuple( - coord=( - (l / self.scale) + ocr_rect.l, - (b / self.scale) + ocr_rect.t, - (r / self.scale) + ocr_rect.l, - (t / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, + rect=BoundingRectangle.from_bounding_box( + BoundingBox.from_tuple( + coord=( + (l / self.scale) + ocr_rect.l, + (b / self.scale) + ocr_rect.t, + (r / self.scale) + ocr_rect.l, + (t / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ) ), ) all_ocr_cells.append(cell) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index c41806f5..7ee6d377 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -2,8 +2,9 @@ import logging from typing import Iterable from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import BoundingRectangle, TextCell -from docling.datamodel.base_models import Cell, OcrCell, Page +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.settings import settings @@ -173,13 +174,16 @@ class TesseractOcrModel(BaseOcrModel): top = (box["y"] + box["h"]) / self.scale cells.append( - OcrCell( - id=ix, + TextCell( + index=ix, text=text, + orig=text, confidence=confidence, - bbox=BoundingBox.from_tuple( - coord=(left, top, right, bottom), - origin=CoordOrigin.TOPLEFT, + rect=BoundingRectangle.from_bounding_box( + BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, + ), ), ) ) diff --git a/docling/utils/export.py b/docling/utils/export.py index 5b022f4a..d480c664 100644 --- a/docling/utils/export.py +++ b/docling/utils/export.py @@ -2,9 +2,9 @@ import logging from typing import Any, Dict, Iterable, List, Tuple, Union from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import TextCell from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table -from docling.datamodel.base_models import OcrCell from docling.datamodel.document import ConversionResult, Page _log = logging.getLogger(__name__) @@ -86,11 +86,13 @@ def generate_multimodal_pages( if page.size is None: return cells for cell in page.cells: - new_bbox = cell.bbox.to_top_left_origin( - page_height=page.size.height - ).normalized(page_size=page.size) - is_ocr = isinstance(cell, OcrCell) - ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0 + new_bbox = ( + cell.rect.to_bounding_box() + .to_top_left_origin(page_height=page.size.height) + .normalized(page_size=page.size) + ) + is_ocr = isinstance(cell, TextCell) + ocr_confidence = cell.confidence if isinstance(cell, TextCell) else 1.0 cells.append( { "text": cell.text, diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py index e2b950f4..771b4207 100644 --- a/docling/utils/layout_postprocessor.py +++ b/docling/utils/layout_postprocessor.py @@ -5,9 +5,10 @@ from collections import defaultdict from typing import Dict, List, Set, Tuple from docling_core.types.doc import DocItemLabel, Size +from docling_core.types.doc.page import TextCell from rtree import index -from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell +from docling.datamodel.base_models import BoundingBox, Cluster _log = logging.getLogger(__name__) @@ -198,7 +199,7 @@ class LayoutPostprocessor: DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER, } - def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size): + def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size): """Initialize processor with cells and clusters.""" """Initialize processor with cells and spatial indices.""" self.cells = cells @@ -218,7 +219,7 @@ class LayoutPostprocessor: [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES] ) - def postprocess(self) -> Tuple[List[Cluster], List[Cell]]: + def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]: """Main processing pipeline.""" self.regular_clusters = self._process_regular_clusters() self.special_clusters = self._process_special_clusters() @@ -272,14 +273,14 @@ class LayoutPostprocessor: orphan_clusters = [] for i, cell in enumerate(unassigned): conf = 1.0 - if isinstance(cell, OcrCell): + if isinstance(cell, TextCell): conf = cell.confidence orphan_clusters.append( Cluster( id=next_id + i, label=DocItemLabel.TEXT, - bbox=cell.bbox, + bbox=cell.to_bounding_box(), confidence=conf, cells=[cell], ) @@ -557,13 +558,13 @@ class LayoutPostprocessor: return current_best if current_best else clusters[0] - def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]: + def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]: """Ensure each cell appears only once, maintaining order of first appearance.""" seen_ids = set() unique_cells = [] for cell in cells: - if cell.id not in seen_ids: - seen_ids.add(cell.id) + if cell.index not in seen_ids: + seen_ids.add(cell.index) unique_cells.append(cell) return unique_cells @@ -582,11 +583,13 @@ class LayoutPostprocessor: best_cluster = None for cluster in clusters: - if cell.bbox.area() <= 0: + if cell.rect.to_bounding_box().area() <= 0: continue - overlap = cell.bbox.intersection_area_with(cluster.bbox) - overlap_ratio = overlap / cell.bbox.area() + overlap = cell.rect.to_bounding_box().intersection_area_with( + cluster.bbox + ) + overlap_ratio = overlap / cell.rect.to_bounding_box().area() if overlap_ratio > best_overlap: best_overlap = overlap_ratio @@ -601,11 +604,13 @@ class LayoutPostprocessor: return clusters - def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]: + def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]: """Find cells not assigned to any cluster.""" - assigned = {cell.id for cluster in clusters for cell in cluster.cells} + assigned = {cell.index for cluster in clusters for cell in cluster.cells} return [ - cell for cell in self.cells if cell.id not in assigned and cell.text.strip() + cell + for cell in self.cells + if cell.index not in assigned and cell.text.strip() ] def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]: @@ -615,10 +620,10 @@ class LayoutPostprocessor: continue cells_bbox = BoundingBox( - l=min(cell.bbox.l for cell in cluster.cells), - t=min(cell.bbox.t for cell in cluster.cells), - r=max(cell.bbox.r for cell in cluster.cells), - b=max(cell.bbox.b for cell in cluster.cells), + l=min(cell.rect.to_bounding_box().l for cell in cluster.cells), + t=min(cell.rect.to_bounding_box().t for cell in cluster.cells), + r=max(cell.rect.to_bounding_box().r for cell in cluster.cells), + b=max(cell.rect.to_bounding_box().b for cell in cluster.cells), ) if cluster.label == DocItemLabel.TABLE: @@ -634,9 +639,9 @@ class LayoutPostprocessor: return clusters - def _sort_cells(self, cells: List[Cell]) -> List[Cell]: + def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]: """Sort cells in native reading order.""" - return sorted(cells, key=lambda c: (c.id)) + return sorted(cells, key=lambda c: (c.index)) def _sort_clusters( self, clusters: List[Cluster], mode: str = "id" @@ -647,7 +652,7 @@ class LayoutPostprocessor: clusters, key=lambda cluster: ( ( - min(cell.id for cell in cluster.cells) + min(cell.index for cell in cluster.cells) if cluster.cells else sys.maxsize ), diff --git a/docling/utils/visualization.py b/docling/utils/visualization.py index e7ea24a5..6c4815fa 100644 --- a/docling/utils/visualization.py +++ b/docling/utils/visualization.py @@ -25,7 +25,7 @@ def draw_clusters( # Draw cells first (underneath) cell_color = (0, 0, 0, 40) # Transparent black for cells for tc in c.cells: - cx0, cy0, cx1, cy1 = tc.bbox.as_tuple() + cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple() cx0 *= scale_x cx1 *= scale_x cy0 *= scale_x diff --git a/poetry.lock b/poetry.lock index 9c62c494..cccabac7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -898,7 +898,7 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] type = "git" url = "https://github.com/DS4SD/docling-core" reference = "cau/docling-parse-types" -resolved_reference = "31db5b0225a4baa8be5f26cc50050cf4bc845204" +resolved_reference = "5f404c0270408ba794c18f8d6923cfa9f2980d73" [[package]] name = "docling-ibm-models" diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py deleted file mode 100644 index 3c214791..00000000 --- a/tests/test_backend_docling_parse.py +++ /dev/null @@ -1,77 +0,0 @@ -from pathlib import Path - -import pytest -from docling_core.types.doc import BoundingBox - -from docling.backend.docling_parse_backend import ( - DoclingParseDocumentBackend, - DoclingParsePageBackend, -) -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument - - -@pytest.fixture -def test_doc_path(): - return Path("./tests/data/pdf/2206.01062.pdf") - - -def _get_backend(pdf_doc): - in_doc = InputDocument( - path_or_stream=pdf_doc, - format=InputFormat.PDF, - backend=DoclingParseDocumentBackend, - ) - - doc_backend = in_doc._backend - return doc_backend - - -def test_text_cell_counts(): - pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") - - doc_backend = _get_backend(pdf_doc) - - for page_index in range(0, doc_backend.page_count()): - last_cell_count = None - for i in range(10): - page_backend: DoclingParsePageBackend = doc_backend.load_page(0) - cells = list(page_backend.get_text_cells()) - - if last_cell_count is None: - last_cell_count = len(cells) - - if len(cells) != last_cell_count: - assert ( - False - ), "Loading page multiple times yielded non-identical text cell counts" - last_cell_count = len(cells) - - -def test_get_text_from_rect(test_doc_path): - doc_backend = _get_backend(test_doc_path) - page_backend: DoclingParsePageBackend = doc_backend.load_page(0) - - # Get the title text of the DocLayNet paper - textpiece = page_backend.get_text_in_rect( - bbox=BoundingBox(l=102, t=77, r=511, b=124) - ) - ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - - assert textpiece.strip() == ref - - -def test_crop_page_image(test_doc_path): - doc_backend = _get_backend(test_doc_path) - page_backend: DoclingParsePageBackend = doc_backend.load_page(0) - - # Crop out "Figure 1" from the DocLayNet paper - im = page_backend.get_page_image( - scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) - ) - # im.show() - - -def test_num_pages(test_doc_path): - doc_backend = _get_backend(test_doc_path) - doc_backend.page_count() == 9 diff --git a/tests/test_backend_docling_parse_v3.py b/tests/test_backend_docling_parse_v3.py index 8230e223..17a951c0 100644 --- a/tests/test_backend_docling_parse_v3.py +++ b/tests/test_backend_docling_parse_v3.py @@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): diff --git a/tests/test_code_formula.py b/tests/test_code_formula.py index ac7a1587..a607c09d 100644 --- a/tests/test_code_formula.py +++ b/tests/test_code_formula.py @@ -3,7 +3,6 @@ from pathlib import Path from docling_core.types.doc import CodeItem, TextItem from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index d2215d61..427c0c84 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -1,6 +1,6 @@ from pathlib import Path -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions @@ -33,7 +33,7 @@ def get_converter(): converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend + pipeline_options=pipeline_options, backend=DoclingParseV3DocumentBackend ) } ) diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 62e4c855..8c75b1b3 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -2,7 +2,7 @@ import sys from pathlib import Path from typing import List -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( @@ -44,7 +44,7 @@ def get_converter(ocr_options: OcrOptions): format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, - backend=DoclingParseDocumentBackend, + backend=DoclingParseV3DocumentBackend, ) } ) diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 1978bc74..dcc6e510 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -3,7 +3,7 @@ from pathlib import Path import pytest -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption @@ -30,7 +30,7 @@ def converter(): converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend + pipeline_options=pipeline_options, backend=DoclingParseV3DocumentBackend ) } ) diff --git a/tests/test_options.py b/tests/test_options.py index c8701a1b..ffb114a9 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -3,7 +3,7 @@ from pathlib import Path import pytest -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( @@ -33,7 +33,7 @@ def get_converters_with_table_options(): format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, - backend=DoclingParseDocumentBackend, + backend=DoclingParseV3DocumentBackend, ) } ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 45152e0a..02861a8b 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -79,8 +79,8 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]): pred_text = cell_pred_item.text assert true_text == pred_text, f"{true_text}!={pred_text}" - true_bbox = cell_true_item.bbox.as_tuple() - pred_bbox = cell_pred_item.bbox.as_tuple() + true_bbox = cell_true_item.rect.to_bounding_box().as_tuple() + pred_bbox = cell_pred_item.rect.to_bounding_box().as_tuple() assert ( true_bbox == pred_bbox ), f"bbox is not the same: {true_bbox} != {pred_bbox}"