feat: add the Image backend (#2627)

* feat: add the Image backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the pre-commit

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Fixed single- versus multi-frame image formats

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code.

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Adapt tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: correct mets_gbs backend test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Make ImagePageBackend.get_bitmap_rects() yield

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-11-17 11:37:22 +01:00
committed by GitHub
parent ae30373ee7
commit 3495b73de8
12 changed files with 494 additions and 82 deletions

View File

@@ -4,9 +4,6 @@ from pathlib import Path
import pytest
from pydantic import ValidationError
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.backend_options import (
@@ -17,7 +14,7 @@ from docling.datamodel.backend_options import (
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.datamodel.settings import DocumentLimits
from docling.document_converter import PdfFormatOption
from docling.document_converter import ImageFormatOption, PdfFormatOption
def test_in_doc_from_valid_path():
@@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf():
assert doc.valid is False
def test_image_in_pdf_backend():
    """Check that a PNG opened as an IMAGE input is valid for each backend.

    The same image path is wrapped in an ``InputDocument`` once per backend
    class listed below, and every resulting document must report ``valid``.
    """
    image_path = Path("tests/data/2305.03393v1-pg9-img.png")
    backend_classes = (
        PyPdfiumDocumentBackend,
        DoclingParseDocumentBackend,
        DoclingParseV2DocumentBackend,
        DoclingParseV4DocumentBackend,
    )

    # Backends are tried in the same order as before; the first failing
    # backend aborts the test, so later ones are not constructed.
    for backend_cls in backend_classes:
        in_doc = InputDocument(
            path_or_stream=image_path,
            format=InputFormat.IMAGE,
            backend=backend_cls,
        )
        assert in_doc.valid
def test_in_doc_with_page_range():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits()
@@ -297,7 +264,7 @@ def test_tiff_two_pages():
doc = InputDocument(
path_or_stream=tiff_path,
format=InputFormat.IMAGE,
backend=PdfFormatOption().backend, # use default backend
backend=ImageFormatOption().backend, # use default backend
)
assert doc.valid is True
assert doc.page_count == 2