From d654568ad90b50cf88102d7ef405b75bfdbd4a40 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 14 Mar 2025 13:32:37 +0100 Subject: [PATCH] Test all backends, fixes Signed-off-by: Christoph Auer --- docling/backend/docling_parse_backend.py | 4 +-- docling/backend/docling_parse_v4_backend.py | 28 +++++++------------ docling/cli/main.py | 4 ++- docling/document_converter.py | 10 +++---- tests/test_code_formula.py | 3 +-- tests/test_document_picture_classifier.py | 3 +-- tests/test_e2e_conversion.py | 2 +- tests/test_e2e_ocr_conversion.py | 1 - tests/test_interfaces.py | 2 +- tests/test_options.py | 30 ++++++++++++++++++++- 10 files changed, 52 insertions(+), 35 deletions(-) diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index ab63db98..533ed429 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -108,8 +108,8 @@ class DoclingParsePageBackend(PdfPageBackend): t=y1 * page_size.height / parser_height, coord_origin=CoordOrigin.BOTTOMLEFT, ) - ), - ).to_top_left_origin(page_size.height), + ).to_top_left_origin(page_size.height), + ) ) cell_counter += 1 diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py index acea79e4..e98d2ecf 100644 --- a/docling/backend/docling_parse_v4_backend.py +++ b/docling/backend/docling_parse_v4_backend.py @@ -62,25 +62,15 @@ class DoclingParseV4PageBackend(PdfPageBackend): [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells] - for cell in self._dpage.textline_cells: - rect = cell.rect - - # if rect.r_x2 < rect.r_x0: - # rect.r_x0, rect.r_x2 = rect.r_x2, rect.r_x0 - # rect.r_y3, rect.r_y1 = rect.r_y1, rect.r_y3 - - # rect.r_x2, rect.r_x3 = rect.r_x3, rect.r_x2 - - # if rect.r_y2 > rect.r_y0: - # rect.r_y2, rect.r_y0 = rect.r_y0, rect.r_y2 - # rect.r_y3, rect.r_y1 = rect.r_y1, rect.r_y3 - - assert ( - rect.to_bounding_box().l <= rect.to_bounding_box().r - ), 
f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}" - assert ( - rect.to_bounding_box().t <= rect.to_bounding_box().b - ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}" + # for cell in self._dpage.textline_cells: + # rect = cell.rect + # + # assert ( + # rect.to_bounding_box().l <= rect.to_bounding_box().r + # ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}" + # assert ( + # rect.to_bounding_box().t <= rect.to_bounding_box().b + # ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}" return self._dpage.textline_cells diff --git a/docling/cli/main.py b/docling/cli/main.py index 1e2ce6a2..6ba0d616 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -14,6 +14,7 @@ from docling_core.types.doc import ImageRefMode from docling_core.utils.file import resolve_source_to_path from pydantic import TypeAdapter +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend @@ -412,8 +413,9 @@ def convert( if artifacts_path is not None: pipeline_options.artifacts_path = artifacts_path + backend: Type[PdfDocumentBackend] if pdf_backend == PdfBackend.DLPARSE_V1: - backend = DoclingParseV2DocumentBackend + backend = DoclingParseDocumentBackend elif pdf_backend == PdfBackend.DLPARSE_V2: backend = DoclingParseV2DocumentBackend elif pdf_backend == PdfBackend.DLPARSE_V4: diff --git a/docling/document_converter.py b/docling/document_converter.py index d52efcea..445b77a0 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call from docling.backend.abstract_backend import AbstractDocumentBackend from 
docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.csv_backend import CsvDocumentBackend -from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.md_backend import MarkdownDocumentBackend @@ -109,12 +109,12 @@ class XMLJatsFormatOption(FormatOption): class ImageFormatOption(FormatOption): pipeline_cls: Type = StandardPdfPipeline - backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend + backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend class PdfFormatOption(FormatOption): pipeline_cls: Type = StandardPdfPipeline - backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend + backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend def _get_default_option(format: InputFormat) -> FormatOption: @@ -147,10 +147,10 @@ def _get_default_option(format: InputFormat) -> FormatOption: pipeline_cls=SimplePipeline, backend=JatsDocumentBackend ), InputFormat.IMAGE: FormatOption( - pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend + pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend ), InputFormat.PDF: FormatOption( - pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend + pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend ), InputFormat.JSON_DOCLING: FormatOption( pipeline_cls=SimplePipeline, backend=DoclingJSONBackend diff --git a/tests/test_code_formula.py b/tests/test_code_formula.py index a607c09d..3263bcd9 100644 --- a/tests/test_code_formula.py +++ b/tests/test_code_formula.py @@ -3,7 +3,7 @@ from pathlib import Path from docling_core.types.doc import CodeItem, TextItem from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel 
-from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import PdfPipelineOptions @@ -24,7 +24,6 @@ def get_converter(): converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( - backend=DoclingParseV2DocumentBackend, pipeline_cls=StandardPdfPipeline, pipeline_options=pipeline_options, ) diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py index 6ca54d63..1c10ef3c 100644 --- a/tests/test_document_picture_classifier.py +++ b/tests/test_document_picture_classifier.py @@ -2,7 +2,7 @@ from pathlib import Path from docling_core.types.doc import PictureClassificationData -from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import PdfPipelineOptions @@ -26,7 +26,6 @@ def get_converter(): converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( - backend=DoclingParseV2DocumentBackend, pipeline_cls=StandardPdfPipeline, pipeline_options=pipeline_options, ) diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 9ddf6758..0801d460 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -34,7 +34,7 @@ def get_converter(): converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_options=pipeline_options, backend=DoclingParseV4DocumentBackend + pipeline_options=pipeline_options, ) } ) diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index
34b4261f..ac805ce6 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -45,7 +45,6 @@ def get_converter(ocr_options: OcrOptions): format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, - backend=DoclingParseV4DocumentBackend, ) } ) diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index e89fbfd4..e38bf582 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -31,7 +31,7 @@ def converter(): converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_options=pipeline_options, backend=DoclingParseV4DocumentBackend + pipeline_options=pipeline_options, ) } ) diff --git a/tests/test_options.py b/tests/test_options.py index 3e8a0d54..7d16210e 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -3,7 +3,10 @@ from pathlib import Path import pytest +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( @@ -33,7 +36,6 @@ def get_converters_with_table_options(): format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, - backend=DoclingParseV4DocumentBackend, ) } ) @@ -137,3 +139,29 @@ def test_ocr_coverage_threshold(test_doc_path): # this should have generated no results, since we set a very high threshold assert len(doc_result.document.texts) == 0 + + +def test_parser_backends(test_doc_path): + pipeline_options = PdfPipelineOptions() + pipeline_options.do_ocr = False + pipeline_options.do_table_structure = False + + for backend_t in [ + DoclingParseV4DocumentBackend, + 
DoclingParseV2DocumentBackend, + DoclingParseDocumentBackend, + PyPdfiumDocumentBackend, + ]: + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + backend=backend_t, + ) + } + ) + + test_doc_path = Path("./tests/data/pdf/code_and_formula.pdf") + doc_result: ConversionResult = converter.convert(test_doc_path) + + assert doc_result.status == ConversionStatus.SUCCESS