feat: add the Image backend (#2627)

* feat: add the Image backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the pre-commit

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Fixed single- versus multi-frame image formats

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code.

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Adapt tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: correct mets_gbs backend test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Make ImagePageBackend.get_bitmap_rects() yield

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-11-17 11:37:22 +01:00
committed by GitHub
parent ae30373ee7
commit 3495b73de8
12 changed files with 494 additions and 82 deletions

View File

@@ -0,0 +1,188 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import InputFormat, Size
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class _ImagePageBackend(PdfPageBackend):
    """Page backend that exposes one in-memory PIL image as a page.

    The page has no text layer: text queries return empty results and the
    only bitmap region reported is the full page.
    """

    def __init__(self, image: Image.Image):
        self._image: Optional[Image.Image] = image
        self.valid: bool = self._image is not None

    def is_valid(self) -> bool:
        """Return the validity flag computed at construction time."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return an empty string; a raw image carries no text without OCR."""
        return ""

    def get_segmented_page(self) -> SegmentedPdfPage:
        """Return a segmented page with the image dimensions and no cells."""
        assert self._image is not None
        size = self.get_size()
        # NOTE(review): t=0 / b=height combined with BOTTOMLEFT looks inverted
        # for a bottom-left origin -- confirm consumers only need the extent.
        page_box = BoundingBox(
            l=0.0,
            t=0.0,
            r=float(size.width),
            b=float(size.height),
            coord_origin=CoordOrigin.BOTTOMLEFT,
        )
        # Every PDF-style boundary box collapses to the full image extent.
        geometry = PdfPageGeometry(
            angle=0.0,
            rect=BoundingRectangle.from_bounding_box(page_box),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            art_bbox=page_box,
            bleed_bbox=page_box,
            crop_bbox=page_box,
            media_bbox=page_box,
            trim_bbox=page_box,
        )
        return SegmentedPdfPage(
            dimension=geometry,
            char_cells=[],
            word_cells=[],
            textline_cells=[],
            has_chars=False,
            has_words=False,
            has_lines=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return no text cells; raw images have none."""
        return []

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield a single TOPLEFT box covering the whole page.

        The box is multiplied by ``scale`` when ``scale`` differs from 1.
        """
        assert self._image is not None
        size = self.get_size()
        whole_page = BoundingBox(
            l=0.0,
            t=0.0,
            r=float(size.width),
            b=float(size.height),
            coord_origin=CoordOrigin.TOPLEFT,
        )
        yield (whole_page if scale == 1 else whole_page.scaled(scale=scale))

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image, optionally cropped and then rescaled.

        Args:
            scale: Resize factor applied after cropping; 1 leaves the size as is.
            cropbox: Region to cut out. Boxes not in TOPLEFT coordinates are
                converted using the image height before cropping.
        """
        assert self._image is not None
        page_img = self._image

        if cropbox is not None:
            # The pipeline is expected to hand over TOPLEFT boxes; convert
            # anything else relative to the current image height.
            if cropbox.coord_origin == CoordOrigin.TOPLEFT:
                box = cropbox
            else:
                box = cropbox.to_top_left_origin(page_img.height)
            x0, y0, x1, y1 = box.as_tuple()
            # Snap to whole pixels and clamp to the image bounds.
            clamped = (
                max(0, round(x0)),
                max(0, round(y0)),
                min(page_img.width, round(x1)),
                min(page_img.height, round(y1)),
            )
            page_img = page_img.crop(clamped)

        if scale == 1:
            return page_img

        # Never produce a zero-sized target dimension.
        target = (
            max(1, round(page_img.width * scale)),
            max(1, round(page_img.height * scale)),
        )
        return page_img.resize(target)

    def get_size(self) -> Size:
        """Return the pixel width and height of the held image."""
        assert self._image is not None
        return Size(width=self._image.width, height=self._image.height)

    def unload(self):
        """Drop the image reference so it can be garbage collected."""
        self._image = None
class ImageDocumentBackend(PdfDocumentBackend):
    """Image-native backend that bypasses pypdfium2.

    Notes:
        - Subclasses PdfDocumentBackend to satisfy pipeline type checks.
        - Intentionally avoids calling PdfDocumentBackend.__init__ to skip
          the image→PDF conversion and any pypdfium2 usage.
        - Handles multi-page TIFF by extracting frames eagerly to separate
          Image objects to keep thread-safety when pages process in parallel.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: PdfBackendOptions = PdfBackendOptions(),
    ):
        """Decode every frame of the input image into an RGB page.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: Path or in-memory stream holding the image data.
            options: Backend options stored on the instance.

        Raises:
            RuntimeError: If the input format is not IMAGE, or if the image
                cannot be opened or decoded.
        """
        # Bypass PdfDocumentBackend.__init__ to avoid image→PDF conversion
        AbstractDocumentBackend.__init__(self, in_doc, path_or_stream, options)
        self.options: PdfBackendOptions = options

        if self.input_format not in {InputFormat.IMAGE}:
            raise RuntimeError(
                f"Incompatible file format {self.input_format} was passed to ImageDocumentBackend."
            )

        # Load frames eagerly for thread-safety across pages
        self._frames: List[Image.Image] = []
        try:
            # The context manager releases the file handle Pillow opened for a
            # path input; convert() yields a new, already decoded image, so
            # the stored frames stay usable after the source is closed.
            with Image.open(self.path_or_stream) as img:  # type: ignore[arg-type]
                # Handle multi-frame and single-frame images
                # - multiframe formats: TIFF, GIF, ICO
                # - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP,
                #   WEBP (unless animated), HEIC
                frame_count = getattr(img, "n_frames", 1)
                if frame_count > 1:
                    for i in range(frame_count):
                        img.seek(i)
                        # convert() already returns a separate image, so no
                        # extra copy() is needed per frame.
                        self._frames.append(img.convert("RGB"))
                else:
                    self._frames.append(img.convert("RGB"))
        except Exception as e:
            raise RuntimeError(f"Could not load image for document {self.file}") from e

    def is_valid(self) -> bool:
        """Return True when at least one frame is held."""
        return len(self._frames) > 0

    def page_count(self) -> int:
        """Return the number of frames, one page per frame."""
        return len(self._frames)

    def load_page(self, page_no: int) -> _ImagePageBackend:
        """Return a page backend for the zero-based frame index ``page_no``.

        Raises:
            IndexError: If ``page_no`` is outside the range of loaded frames.
        """
        if not (0 <= page_no < len(self._frames)):
            raise IndexError(f"Page index out of range: {page_no}")
        return _ImagePageBackend(self._frames[page_no])

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the formats handled by this backend."""
        # Only IMAGE here; PDF handling remains in PDF-oriented backends
        return {InputFormat.IMAGE}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report that this backend serves documents page by page."""
        return True

    def unload(self):
        """Run the parent cleanup and drop all held frames."""
        super().unload()
        self._frames = []

View File

@@ -60,38 +60,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
super().__init__(in_doc, path_or_stream, options)
self.options: PdfBackendOptions
if self.input_format is not InputFormat.PDF:
if self.input_format is InputFormat.IMAGE:
buf = BytesIO()
img = Image.open(self.path_or_stream)
# Handle multi-page TIFF images
if hasattr(img, "n_frames") and img.n_frames > 1:
# Extract all frames from multi-page image
frames = []
try:
for i in range(img.n_frames):
img.seek(i)
frame = img.copy().convert("RGB")
frames.append(frame)
except EOFError:
pass
# Save as multi-page PDF
if frames:
frames[0].save(
buf, "PDF", save_all=True, append_images=frames[1:]
)
else:
# Fallback to single page if frame extraction fails
img.convert("RGB").save(buf, "PDF")
else:
# Single page image - convert to RGB and save
img.convert("RGB").save(buf, "PDF")
buf.seek(0)
self.path_or_stream = buf
elif self.input_format not in self.supported_formats():
if self.input_format not in self.supported_formats():
raise RuntimeError(
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
)
@@ -106,7 +75,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PDF, InputFormat.IMAGE}
return {InputFormat.PDF}
@classmethod
def supports_pagination(cls) -> bool:

View File

@@ -26,6 +26,7 @@ from rich.console import Console
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -698,9 +699,16 @@ def convert( # noqa: C901
if artifacts_path is not None:
simple_format_option.artifacts_path = artifacts_path
# Use image-native backend for IMAGE to avoid pypdfium2 locking
image_format_option = PdfFormatOption(
pipeline_options=pipeline_options,
backend=ImageDocumentBackend,
backend_options=pdf_backend_options,
)
format_options = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
InputFormat.IMAGE: image_format_option,
InputFormat.METS_GBS: mets_gbs_format_option,
InputFormat.DOCX: WordFormatOption(
pipeline_options=simple_format_option

View File

@@ -164,6 +164,7 @@ class DoclingComponentType(str, Enum):
MODEL = "model"
DOC_ASSEMBLER = "doc_assembler"
USER_INPUT = "user_input"
PIPELINE = "pipeline"
class VlmStopReason(str, Enum):

View File

@@ -3,6 +3,7 @@ import logging
import sys
import threading
import time
import warnings
from collections.abc import Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
@@ -21,6 +22,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
@@ -129,7 +131,7 @@ class XMLJatsFormatOption(FormatOption):
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
backend: Type[AbstractDocumentBackend] = ImageDocumentBackend
class PdfFormatOption(FormatOption):
@@ -184,10 +186,35 @@ class DocumentConverter:
self.allowed_formats = (
allowed_formats if allowed_formats is not None else list(InputFormat)
)
# Normalize format options: ensure IMAGE format uses ImageDocumentBackend
# for backward compatibility (old code might use PdfFormatOption or other backends for images)
normalized_format_options: dict[InputFormat, FormatOption] = {}
if format_options:
for format, option in format_options.items():
if (
format == InputFormat.IMAGE
and option.backend is not ImageDocumentBackend
):
warnings.warn(
f"Using {option.backend.__name__} for InputFormat.IMAGE is deprecated. "
"Images should use ImageDocumentBackend via ImageFormatOption. "
"Automatically correcting the backend, please update your code to avoid this warning.",
DeprecationWarning,
stacklevel=2,
)
# Convert to ImageFormatOption while preserving pipeline and backend options
normalized_format_options[format] = ImageFormatOption(
pipeline_options=option.pipeline_options,
backend_options=option.backend_options,
)
else:
normalized_format_options[format] = option
self.format_to_options: dict[InputFormat, FormatOption] = {
format: (
_get_default_option(format=format)
if (custom_option := (format_options or {}).get(format)) is None
if (custom_option := normalized_format_options.get(format)) is None
else custom_option
)
for format in self.allowed_formats
@@ -263,8 +290,12 @@ class DocumentConverter:
ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS,
}:
error_details = ""
if conv_res.errors:
error_messages = [err.error_message for err in conv_res.errors]
error_details = f" Errors: {'; '.join(error_messages)}"
raise ConversionError(
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}.{error_details}"
)
else:
yield conv_res

View File

@@ -14,6 +14,7 @@ from pydantic import ConfigDict, model_validator, validate_call
from typing_extensions import Self
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
BaseFormatOption,
@@ -72,7 +73,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
the format registry between convert/extract.
"""
format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.IMAGE: PyPdfiumDocumentBackend,
InputFormat.IMAGE: ImageDocumentBackend,
InputFormat.PDF: PyPdfiumDocumentBackend,
}

View File

@@ -76,8 +76,15 @@ class BasePipeline(ABC):
conv_res.status = self._determine_status(conv_res)
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
if raises_on_error:
raise e
if not raises_on_error:
error_item = ErrorItem(
component_type=DoclingComponentType.PIPELINE,
module_name=self.__class__.__name__,
error_message=str(e),
)
conv_res.errors.append(error_item)
else:
raise RuntimeError(f"Pipeline {self.__class__.__name__} failed") from e
finally:
self._unload(conv_res)

View File

@@ -30,7 +30,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.datamodel.settings import settings
@@ -265,7 +271,9 @@ class ThreadedPipelineStage:
)
)
except Exception as exc:
_log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
_log.error(
"Stage %s failed for run %d: %s", self.name, rid, exc, exc_info=True
)
for it in items:
it.is_failed = True
it.error = exc
@@ -598,6 +606,16 @@ class StandardPdfPipeline(ConvertPipeline):
if p.page_no in page_map
or not any(fp == p.page_no for fp, _ in proc.failed_pages)
]
# Add error details from failed pages
for page_no, error in proc.failed_pages:
page_label = f"Page {page_no + 1}" if page_no >= 0 else "Unknown page"
error_msg = str(error) if error else ""
error_item = ErrorItem(
component_type=DoclingComponentType.PIPELINE,
module_name=self.__class__.__name__,
error_message=f"{page_label}: {error_msg}" if error_msg else page_label,
)
conv_res.errors.append(error_item)
if proc.is_complete_failure:
conv_res.status = ConversionStatus.FAILURE
elif proc.is_partial_success:

View File

@@ -0,0 +1,218 @@
from io import BytesIO
from pathlib import Path
import pytest
from docling_core.types.doc import BoundingBox, CoordOrigin
from PIL import Image
from docling.backend.image_backend import ImageDocumentBackend, _ImagePageBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.document_converter import DocumentConverter, ImageFormatOption
from docling.document_extractor import DocumentExtractor
def _make_png_stream(
    width: int = 64, height: int = 48, color=(123, 45, 67)
) -> DocumentStream:
    """Build a solid-colour PNG in memory and wrap it in a DocumentStream."""
    payload = BytesIO()
    Image.new("RGB", (width, height), color).save(payload, format="PNG")
    # Rewind so consumers read the PNG from its first byte.
    payload.seek(0)
    return DocumentStream(name="test.png", stream=payload)
def _make_multipage_tiff_stream(num_pages: int = 3, size=(32, 32)) -> DocumentStream:
    """Build an in-memory multi-frame TIFF wrapped in a DocumentStream.

    Each frame gets a colour derived from its index.
    """
    pages = []
    for idx in range(num_pages):
        rgb = (idx * 10 % 255, idx * 20 % 255, idx * 30 % 255)
        pages.append(Image.new("RGB", size, rgb))

    payload = BytesIO()
    # The first frame drives the save; the remaining ones are appended.
    pages[0].save(payload, format="TIFF", save_all=True, append_images=pages[1:])
    payload.seek(0)
    return DocumentStream(name="test.tiff", stream=payload)
def test_docs_builder_uses_image_backend_for_image_stream():
    """A PNG stream built with ImageFormatOption gets ImageDocumentBackend."""
    conv_input = _DocumentConversionInput(
        path_or_stream_iterator=[_make_png_stream()]
    )
    # IMAGE -> ImageFormatOption, which carries ImageDocumentBackend.
    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}

    built = list(conv_input.docs(options_by_format))

    assert len(built) == 1
    doc = built[0]
    assert doc.format == InputFormat.IMAGE
    assert isinstance(doc._backend, ImageDocumentBackend)
    assert doc.page_count == 1
def test_docs_builder_multipage_tiff_counts_frames():
    """A four-frame TIFF is reported as a four-page document."""
    tiff = _make_multipage_tiff_stream(num_pages=4)
    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}
    builder = _DocumentConversionInput(path_or_stream_iterator=[tiff])

    doc = next(builder.docs(options_by_format))

    assert isinstance(doc._backend, ImageDocumentBackend)
    assert doc.page_count == 4
def test_converter_default_maps_image_to_image_backend():
    """DocumentConverter defaults IMAGE input to ImageDocumentBackend."""
    image_option = DocumentConverter(
        allowed_formats=[InputFormat.IMAGE]
    ).format_to_options[InputFormat.IMAGE]
    assert image_option.backend is ImageDocumentBackend
def test_extractor_default_maps_image_to_image_backend():
    """DocumentExtractor defaults IMAGE input to ImageDocumentBackend."""
    image_option = DocumentExtractor(
        allowed_formats=[InputFormat.IMAGE]
    ).extraction_format_to_options[InputFormat.IMAGE]
    assert image_option.backend is ImageDocumentBackend
def _get_backend_from_stream(stream: DocumentStream):
    """Return the backend of an InputDocument built from ``stream``.

    The document is created with IMAGE format and ImageDocumentBackend.
    """
    doc_kwargs = {
        "path_or_stream": stream.stream,
        "format": InputFormat.IMAGE,
        "backend": ImageDocumentBackend,
        "filename": stream.name,
    }
    return InputDocument(**doc_kwargs)._backend
def test_num_pages_single():
    """A single PNG yields a page count of one."""
    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
    assert backend.page_count() == 1
def test_num_pages_multipage():
    """A five-frame TIFF yields a page count of five."""
    tiff = _make_multipage_tiff_stream(num_pages=5, size=(64, 64))
    backend = _get_backend_from_stream(tiff)
    assert backend.page_count() == 5
def test_get_size():
    """The page size matches the pixel dimensions of the source image."""
    expected_w, expected_h = 120, 90
    backend = _get_backend_from_stream(
        _make_png_stream(width=expected_w, height=expected_h)
    )
    page: _ImagePageBackend = backend.load_page(0)

    reported = page.get_size()

    assert reported.width == expected_w
    assert reported.height == expected_h
def test_get_page_image_full():
    """Without scale or crop the page image keeps the source dimensions."""
    expected_w, expected_h = 100, 80
    backend = _get_backend_from_stream(
        _make_png_stream(width=expected_w, height=expected_h)
    )
    page: _ImagePageBackend = backend.load_page(0)

    rendered = page.get_page_image()

    assert rendered.width == expected_w
    assert rendered.height == expected_h
def test_get_page_image_scaled():
    """A scale factor multiplies both dimensions of the page image."""
    src_w, src_h = 100, 80
    factor = 2.0
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    rendered = page.get_page_image(scale=factor)

    assert rendered.width == round(src_w * factor)
    assert rendered.height == round(src_h * factor)
def test_crop_page_image():
    """Cropping returns an image the size of the crop box."""
    src_w, src_h = 200, 150
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    # Cut a region out of the middle of the page.
    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
    cropped = page.get_page_image(cropbox=region)

    assert cropped.width == 100  # r - l
    assert cropped.height == 90  # b - t
def test_crop_page_image_scaled():
    """Crop and scale combine: the cropped region is resized by the factor."""
    src_w, src_h = 200, 150
    factor = 0.5
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
    result = page.get_page_image(scale=factor, cropbox=region)

    assert result.width == round(100 * factor)  # cropped width, then scaled
    assert result.height == round(90 * factor)  # cropped height, then scaled
def test_get_bitmap_rects():
    """The only bitmap rect is the full page in TOPLEFT coordinates."""
    src_w, src_h = 100, 80
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    found = list(page.get_bitmap_rects())

    assert len(found) == 1
    rect = found[0]
    assert rect.l == 0.0
    assert rect.t == 0.0
    assert rect.r == float(src_w)
    assert rect.b == float(src_h)
    assert rect.coord_origin == CoordOrigin.TOPLEFT
def test_get_bitmap_rects_scaled():
    """With a scale factor the full-page rect grows by that factor."""
    src_w, src_h = 100, 80
    factor = 2.0
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    found = list(page.get_bitmap_rects(scale=factor))

    assert len(found) == 1
    rect = found[0]
    assert rect.l == 0.0
    assert rect.t == 0.0
    assert rect.r == float(src_w * factor)
    assert rect.b == float(src_h * factor)
    assert rect.coord_origin == CoordOrigin.TOPLEFT
def test_get_text_in_rect():
    """Text lookup on an image page returns an empty string (no OCR)."""
    backend = _get_backend_from_stream(_make_png_stream())
    page: _ImagePageBackend = backend.load_page(0)

    query = BoundingBox(l=10, t=10, r=50, b=50, coord_origin=CoordOrigin.TOPLEFT)

    assert page.get_text_in_rect(query) == ""
def test_multipage_access():
    """Every frame of a multi-page TIFF loads as a valid 64x64 page."""
    expected_pages = 4
    tiff = _make_multipage_tiff_stream(num_pages=expected_pages, size=(64, 64))
    backend = _get_backend_from_stream(tiff)

    assert backend.page_count() == expected_pages

    # Visit each page in order and check validity and dimensions.
    for page_no in range(expected_pages):
        page = backend.load_page(page_no)
        assert page.is_valid()
        dims = page.get_size()
        assert dims.width == 64
        assert dims.height == 64

View File

@@ -15,7 +15,7 @@ def test_doc_path():
def _get_backend(pdf_doc):
in_doc = InputDocument(
path_or_stream=pdf_doc,
format=InputFormat.PDF,
format=InputFormat.METS_GBS,
backend=MetsGbsDocumentBackend,
)

View File

@@ -2,6 +2,8 @@ import sys
from pathlib import Path
from typing import List
from pydantic.type_adapter import R
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.datamodel.pipeline_options import (
@@ -72,7 +74,9 @@ def test_e2e_webp_conversions():
for webp_path in webp_paths:
print(f"converting {webp_path}")
doc_result: ConversionResult = converter.convert(webp_path)
doc_result: ConversionResult = converter.convert(
webp_path, raises_on_error=True
)
verify_conversion_result_v2(
input_path=webp_path,

View File

@@ -4,9 +4,6 @@ from pathlib import Path
import pytest
from pydantic import ValidationError
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.backend_options import (
@@ -17,7 +14,7 @@ from docling.datamodel.backend_options import (
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.datamodel.settings import DocumentLimits
from docling.document_converter import PdfFormatOption
from docling.document_converter import ImageFormatOption, PdfFormatOption
def test_in_doc_from_valid_path():
@@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf():
assert doc.valid is False
def test_image_in_pdf_backend():
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
backend=PyPdfiumDocumentBackend,
)
assert in_doc.valid
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
backend=DoclingParseDocumentBackend,
)
assert in_doc.valid
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
backend=DoclingParseV2DocumentBackend,
)
assert in_doc.valid
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
backend=DoclingParseV4DocumentBackend,
)
assert in_doc.valid
def test_in_doc_with_page_range():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits()
@@ -297,7 +264,7 @@ def test_tiff_two_pages():
doc = InputDocument(
path_or_stream=tiff_path,
format=InputFormat.IMAGE,
backend=PdfFormatOption().backend, # use default backend
backend=ImageFormatOption().backend, # use default backend
)
assert doc.valid is True
assert doc.page_count == 2