diff --git a/docling/backend/image_backend.py b/docling/backend/image_backend.py
new file mode 100644
index 00000000..3bc30014
--- /dev/null
+++ b/docling/backend/image_backend.py
@@ -0,0 +1,202 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, List, Optional, Union
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
+from PIL import Image
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.backend_options import PdfBackendOptions
+from docling.datamodel.base_models import InputFormat, Size
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class _ImagePageBackend(PdfPageBackend):
+    """Page backend that serves one already-decoded image frame."""
+
+    def __init__(self, image: Image.Image):
+        self._image: Optional[Image.Image] = image
+        self.valid: bool = self._image is not None
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        # No text extraction from raw images without OCR
+        return ""
+
+    def get_segmented_page(self) -> SegmentedPdfPage:
+        # Return empty segmented page with proper dimensions for raw images
+        assert self._image is not None
+        page_size = self.get_size()
+        bbox = BoundingBox(
+            l=0.0,
+            t=0.0,
+            r=float(page_size.width),
+            b=float(page_size.height),
+            coord_origin=CoordOrigin.BOTTOMLEFT,
+        )
+        dimension = PdfPageGeometry(
+            angle=0.0,
+            rect=BoundingRectangle.from_bounding_box(bbox),
+            boundary_type=PdfPageBoundaryType.CROP_BOX,
+            art_bbox=bbox,
+            bleed_bbox=bbox,
+            crop_bbox=bbox,
+            media_bbox=bbox,
+            trim_bbox=bbox,
+        )
+        return SegmentedPdfPage(
+            dimension=dimension,
+            char_cells=[],
+            word_cells=[],
+            textline_cells=[],
+            has_chars=False,
+            has_words=False,
+            has_lines=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        # No text cells on raw images
+        return []
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        # For raw images, the entire page is a bitmap
+        assert self._image is not None
+        page_size = self.get_size()
+        full_page_bbox = BoundingBox(
+            l=0.0,
+            t=0.0,
+            r=float(page_size.width),
+            b=float(page_size.height),
+            coord_origin=CoordOrigin.TOPLEFT,
+        )
+        if scale != 1:
+            full_page_bbox = full_page_bbox.scaled(scale=scale)
+        yield full_page_bbox
+
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        """Return the frame, cropped to *cropbox* first and then scaled.
+
+        With no cropbox and ``scale == 1`` the stored frame itself is
+        returned (not a copy).
+        """
+        assert self._image is not None
+        img = self._image
+
+        if cropbox is not None:
+            # Expected cropbox comes in TOPLEFT coords in our pipeline
+            if cropbox.coord_origin != CoordOrigin.TOPLEFT:
+                # Convert to TOPLEFT relative to current image height
+                cropbox = cropbox.to_top_left_origin(img.height)
+            left, top, right, bottom = cropbox.as_tuple()
+            left = max(0, round(left))
+            top = max(0, round(top))
+            right = min(img.width, round(right))
+            bottom = min(img.height, round(bottom))
+            img = img.crop((left, top, right, bottom))
+
+        if scale != 1:
+            new_w = max(1, round(img.width * scale))
+            new_h = max(1, round(img.height * scale))
+            img = img.resize((new_w, new_h))
+
+        return img
+
+    def get_size(self) -> Size:
+        assert self._image is not None
+        return Size(width=self._image.width, height=self._image.height)
+
+    def unload(self):
+        # Help GC and free memory
+        self._image = None
+
+
+class ImageDocumentBackend(PdfDocumentBackend):
+    """Image-native backend that bypasses pypdfium2.
+
+    Notes:
+    - Subclasses PdfDocumentBackend to satisfy pipeline type checks.
+    - Intentionally avoids calling PdfDocumentBackend.__init__ to skip
+      the image→PDF conversion and any pypdfium2 usage.
+    - Handles multi-page TIFF by extracting frames eagerly to separate
+      Image objects to keep thread-safety when pages process in parallel.
+    """
+
+    def __init__(
+        self,
+        in_doc: InputDocument,
+        path_or_stream: Union[BytesIO, Path],
+        options: PdfBackendOptions = PdfBackendOptions(),
+    ):
+        """Decode every frame of the input image into memory.
+
+        Raises:
+            RuntimeError: If the input format is not IMAGE or the image
+                cannot be opened or decoded.
+        """
+        # Bypass PdfDocumentBackend.__init__ to avoid image→PDF conversion
+        AbstractDocumentBackend.__init__(self, in_doc, path_or_stream, options)
+        self.options: PdfBackendOptions = options
+
+        if self.input_format not in {InputFormat.IMAGE}:
+            raise RuntimeError(
+                f"Incompatible file format {self.input_format} was passed to ImageDocumentBackend."
+            )
+
+        # Load frames eagerly for thread-safety across pages
+        self._frames: List[Image.Image] = []
+        try:
+            # The context manager releases the file handle that Pillow opens
+            # for path inputs; every stored frame is an independent copy.
+            with Image.open(self.path_or_stream) as img:  # type: ignore[arg-type]
+                # Handle multi-frame and single-frame images
+                # - multiframe formats: TIFF, GIF, ICO
+                # - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP, WEBP (unless animated), HEIC
+                frame_count = getattr(img, "n_frames", 1)
+
+                if frame_count > 1:
+                    for i in range(frame_count):
+                        img.seek(i)
+                        self._frames.append(img.copy().convert("RGB"))
+                else:
+                    self._frames.append(img.convert("RGB"))
+        except Exception as e:
+            raise RuntimeError(f"Could not load image for document {self.file}") from e
+
+    def is_valid(self) -> bool:
+        return len(self._frames) > 0
+
+    def page_count(self) -> int:
+        return len(self._frames)
+
+    def load_page(self, page_no: int) -> _ImagePageBackend:
+        if not (0 <= page_no < len(self._frames)):
+            raise IndexError(f"Page index out of range: {page_no}")
+        return _ImagePageBackend(self._frames[page_no])
+
+    @classmethod
+    def supported_formats(cls) -> set[InputFormat]:
+        # Only IMAGE here; PDF handling remains in PDF-oriented backends
+        return {InputFormat.IMAGE}
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return True
+
+    def unload(self):
+        super().unload()
+        self._frames = []
diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index 71f89f17..cb45847e 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -60,41 
+60,10 @@ class PdfDocumentBackend(PaginatedDocumentBackend): super().__init__(in_doc, path_or_stream, options) self.options: PdfBackendOptions - if self.input_format is not InputFormat.PDF: - if self.input_format is InputFormat.IMAGE: - buf = BytesIO() - img = Image.open(self.path_or_stream) - - # Handle multi-page TIFF images - if hasattr(img, "n_frames") and img.n_frames > 1: - # Extract all frames from multi-page image - frames = [] - try: - for i in range(img.n_frames): - img.seek(i) - frame = img.copy().convert("RGB") - frames.append(frame) - except EOFError: - pass - - # Save as multi-page PDF - if frames: - frames[0].save( - buf, "PDF", save_all=True, append_images=frames[1:] - ) - else: - # Fallback to single page if frame extraction fails - img.convert("RGB").save(buf, "PDF") - else: - # Single page image - convert to RGB and save - img.convert("RGB").save(buf, "PDF") - - buf.seek(0) - self.path_or_stream = buf - elif self.input_format not in self.supported_formats(): - raise RuntimeError( - f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}." - ) + if self.input_format not in self.supported_formats(): + raise RuntimeError( + f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}." 
+ ) @abstractmethod def load_page(self, page_no: int) -> PdfPageBackend: @@ -106,7 +75,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend): @classmethod def supported_formats(cls) -> Set[InputFormat]: - return {InputFormat.PDF, InputFormat.IMAGE} + return {InputFormat.PDF} @classmethod def supports_pagination(cls) -> bool: diff --git a/docling/cli/main.py b/docling/cli/main.py index 419c3a42..9607d2f4 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -26,6 +26,7 @@ from rich.console import Console from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.backend.image_backend import ImageDocumentBackend from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend @@ -698,9 +699,16 @@ def convert( # noqa: C901 if artifacts_path is not None: simple_format_option.artifacts_path = artifacts_path + # Use image-native backend for IMAGE to avoid pypdfium2 locking + image_format_option = PdfFormatOption( + pipeline_options=pipeline_options, + backend=ImageDocumentBackend, + backend_options=pdf_backend_options, + ) + format_options = { InputFormat.PDF: pdf_format_option, - InputFormat.IMAGE: pdf_format_option, + InputFormat.IMAGE: image_format_option, InputFormat.METS_GBS: mets_gbs_format_option, InputFormat.DOCX: WordFormatOption( pipeline_options=simple_format_option diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 411df4ca..3b50589b 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -164,6 +164,7 @@ class DoclingComponentType(str, Enum): MODEL = "model" DOC_ASSEMBLER = "doc_assembler" USER_INPUT = "user_input" + PIPELINE = "pipeline" class 
VlmStopReason(str, Enum): diff --git a/docling/document_converter.py b/docling/document_converter.py index 834309a9..d6369529 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -3,6 +3,7 @@ import logging import sys import threading import time +import warnings from collections.abc import Iterable, Iterator from concurrent.futures import ThreadPoolExecutor from datetime import datetime @@ -21,6 +22,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.csv_backend import CsvDocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.html_backend import HTMLDocumentBackend +from docling.backend.image_backend import ImageDocumentBackend from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend @@ -129,7 +131,7 @@ class XMLJatsFormatOption(FormatOption): class ImageFormatOption(FormatOption): pipeline_cls: Type = StandardPdfPipeline - backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend + backend: Type[AbstractDocumentBackend] = ImageDocumentBackend class PdfFormatOption(FormatOption): @@ -184,10 +186,35 @@ class DocumentConverter: self.allowed_formats = ( allowed_formats if allowed_formats is not None else list(InputFormat) ) + + # Normalize format options: ensure IMAGE format uses ImageDocumentBackend + # for backward compatibility (old code might use PdfFormatOption or other backends for images) + normalized_format_options: dict[InputFormat, FormatOption] = {} + if format_options: + for format, option in format_options.items(): + if ( + format == InputFormat.IMAGE + and option.backend is not ImageDocumentBackend + ): + warnings.warn( + f"Using {option.backend.__name__} for InputFormat.IMAGE is deprecated. " + "Images should use ImageDocumentBackend via ImageFormatOption. 
" + "Automatically correcting the backend, please update your code to avoid this warning.", + DeprecationWarning, + stacklevel=2, + ) + # Convert to ImageFormatOption while preserving pipeline and backend options + normalized_format_options[format] = ImageFormatOption( + pipeline_options=option.pipeline_options, + backend_options=option.backend_options, + ) + else: + normalized_format_options[format] = option + self.format_to_options: dict[InputFormat, FormatOption] = { format: ( _get_default_option(format=format) - if (custom_option := (format_options or {}).get(format)) is None + if (custom_option := normalized_format_options.get(format)) is None else custom_option ) for format in self.allowed_formats @@ -263,8 +290,12 @@ class DocumentConverter: ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS, }: + error_details = "" + if conv_res.errors: + error_messages = [err.error_message for err in conv_res.errors] + error_details = f" Errors: {'; '.join(error_messages)}" raise ConversionError( - f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}" + f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}.{error_details}" ) else: yield conv_res diff --git a/docling/document_extractor.py b/docling/document_extractor.py index 8ae4a6e6..ae66f9e4 100644 --- a/docling/document_extractor.py +++ b/docling/document_extractor.py @@ -14,6 +14,7 @@ from pydantic import ConfigDict, model_validator, validate_call from typing_extensions import Self from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.backend.image_backend import ImageDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ( BaseFormatOption, @@ -72,7 +73,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption: the format registry between convert/extract. 
""" format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = { - InputFormat.IMAGE: PyPdfiumDocumentBackend, + InputFormat.IMAGE: ImageDocumentBackend, InputFormat.PDF: PyPdfiumDocumentBackend, } diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 0c35d24c..1ee9ab89 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -76,8 +76,15 @@ class BasePipeline(ABC): conv_res.status = self._determine_status(conv_res) except Exception as e: conv_res.status = ConversionStatus.FAILURE - if raises_on_error: - raise e + if not raises_on_error: + error_item = ErrorItem( + component_type=DoclingComponentType.PIPELINE, + module_name=self.__class__.__name__, + error_message=str(e), + ) + conv_res.errors.append(error_item) + else: + raise RuntimeError(f"Pipeline {self.__class__.__name__} failed") from e finally: self._unload(conv_res) diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 82bf012f..6c662c0e 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -30,7 +30,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend -from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page +from docling.datamodel.base_models import ( + AssembledUnit, + ConversionStatus, + DoclingComponentType, + ErrorItem, + Page, +) from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions from docling.datamodel.settings import settings @@ -265,7 +271,9 @@ class ThreadedPipelineStage: ) ) except Exception as exc: - _log.error("Stage %s failed for run %d: %s", self.name, rid, exc) + _log.error( + "Stage %s failed for run %d: %s", self.name, rid, exc, exc_info=True + ) 
for it in items: it.is_failed = True it.error = exc @@ -598,6 +606,16 @@ class StandardPdfPipeline(ConvertPipeline): if p.page_no in page_map or not any(fp == p.page_no for fp, _ in proc.failed_pages) ] + # Add error details from failed pages + for page_no, error in proc.failed_pages: + page_label = f"Page {page_no + 1}" if page_no >= 0 else "Unknown page" + error_msg = str(error) if error else "" + error_item = ErrorItem( + component_type=DoclingComponentType.PIPELINE, + module_name=self.__class__.__name__, + error_message=f"{page_label}: {error_msg}" if error_msg else page_label, + ) + conv_res.errors.append(error_item) if proc.is_complete_failure: conv_res.status = ConversionStatus.FAILURE elif proc.is_partial_success:
diff --git a/tests/test_backend_image_native.py b/tests/test_backend_image_native.py
new file mode 100644
index 00000000..abfd769e
--- /dev/null
+++ b/tests/test_backend_image_native.py
@@ -0,0 +1,187 @@
+from io import BytesIO
+from pathlib import Path
+
+import pytest
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from PIL import Image
+
+from docling.backend.image_backend import ImageDocumentBackend, _ImagePageBackend
+from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.datamodel.document import InputDocument, _DocumentConversionInput
+from docling.document_converter import DocumentConverter, ImageFormatOption
+from docling.document_extractor import DocumentExtractor
+
+
+def _make_png_stream(
+    width: int = 64, height: int = 48, color=(123, 45, 67)
+) -> DocumentStream:
+    """Build an in-memory single-frame PNG of the given size and colour."""
+    payload = BytesIO()
+    Image.new("RGB", (width, height), color).save(payload, format="PNG")
+    payload.seek(0)
+    return DocumentStream(name="test.png", stream=payload)
+
+
+def _make_multipage_tiff_stream(num_pages: int = 3, size=(32, 32)) -> DocumentStream:
+    """Build an in-memory TIFF holding ``num_pages`` solid-colour frames."""
+    pages = []
+    for idx in range(num_pages):
+        rgb = (idx * 10 % 255, idx * 20 % 255, idx * 30 % 255)
+        pages.append(Image.new("RGB", size, rgb))
+    payload = BytesIO()
+    pages[0].save(payload, format="TIFF", save_all=True, append_images=pages[1:])
+    payload.seek(0)
+    return DocumentStream(name="test.tiff", stream=payload)
+
+
+def test_docs_builder_uses_image_backend_for_image_stream():
+    # IMAGE -> ImageFormatOption, which carries ImageDocumentBackend
+    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}
+    builder = _DocumentConversionInput(path_or_stream_iterator=[_make_png_stream()])
+
+    built = list(builder.docs(options_by_format))
+    assert len(built) == 1
+    doc = built[0]
+    assert doc.format == InputFormat.IMAGE
+    assert isinstance(doc._backend, ImageDocumentBackend)
+    assert doc.page_count == 1
+
+
+def test_docs_builder_multipage_tiff_counts_frames():
+    builder = _DocumentConversionInput(
+        path_or_stream_iterator=[_make_multipage_tiff_stream(num_pages=4)]
+    )
+    doc = next(builder.docs({InputFormat.IMAGE: ImageFormatOption()}))
+    assert isinstance(doc._backend, ImageDocumentBackend)
+    assert doc.page_count == 4
+
+
+def test_converter_default_maps_image_to_image_backend():
+    converter = DocumentConverter(allowed_formats=[InputFormat.IMAGE])
+    image_option = converter.format_to_options[InputFormat.IMAGE]
+    assert image_option.backend is ImageDocumentBackend
+
+
+def test_extractor_default_maps_image_to_image_backend():
+    extractor = DocumentExtractor(allowed_formats=[InputFormat.IMAGE])
+    image_option = extractor.extraction_format_to_options[InputFormat.IMAGE]
+    assert image_option.backend is ImageDocumentBackend
+
+
+def _get_backend_from_stream(stream: DocumentStream):
+    """Wrap *stream* in an InputDocument and hand back its ImageDocumentBackend."""
+    return InputDocument(
+        path_or_stream=stream.stream,
+        format=InputFormat.IMAGE,
+        backend=ImageDocumentBackend,
+        filename=stream.name,
+    )._backend
+
+
+def test_num_pages_single():
+    """A single-frame PNG is reported as one page."""
+    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
+    assert backend.page_count() == 1
+
+
+def test_num_pages_multipage():
+    """Each TIFF frame is reported as its own page."""
+    backend = _get_backend_from_stream(
+        _make_multipage_tiff_stream(num_pages=5, size=(64, 64))
+    )
+    assert backend.page_count() == 5
+
+
+def test_get_size():
+    """The page size equals the pixel size of the source image."""
+    backend = _get_backend_from_stream(_make_png_stream(width=120, height=90))
+    page: _ImagePageBackend = backend.load_page(0)
+    page_size = page.get_size()
+    assert (page_size.width, page_size.height) == (120, 90)
+
+
+def test_get_page_image_full():
+    """Without scale or cropbox the image comes back at its native size."""
+    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
+    page: _ImagePageBackend = backend.load_page(0)
+    rendered = page.get_page_image()
+    assert rendered.size == (100, 80)
+
+
+def test_get_page_image_scaled():
+    """Scaling multiplies both image dimensions."""
+    factor = 2.0
+    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
+    page: _ImagePageBackend = backend.load_page(0)
+    rendered = page.get_page_image(scale=factor)
+    assert rendered.size == (round(100 * factor), round(80 * factor))
+
+
+def test_crop_page_image():
+    """A TOPLEFT cropbox yields an image with the box's dimensions."""
+    backend = _get_backend_from_stream(_make_png_stream(width=200, height=150))
+    page: _ImagePageBackend = backend.load_page(0)
+
+    # Region taken from the centre of the page
+    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
+    rendered = page.get_page_image(cropbox=region)
+    assert rendered.size == (100, 90)  # (150 - 50, 120 - 30)
+
+
+def test_crop_page_image_scaled():
+    """Cropping happens first; the scale then applies to the cropped size."""
+    factor = 0.5
+    backend = _get_backend_from_stream(_make_png_stream(width=200, height=150))
+    page: _ImagePageBackend = backend.load_page(0)
+
+    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
+    rendered = page.get_page_image(scale=factor, cropbox=region)
+    assert rendered.size == (round(100 * factor), round(90 * factor))
+
+
+def test_get_bitmap_rects():
+    """The only bitmap rect is the full page, in TOPLEFT coordinates."""
+    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
+    page: _ImagePageBackend = backend.load_page(0)
+
+    boxes = list(page.get_bitmap_rects())
+    assert len(boxes) == 1
+    box = boxes[0]
+    assert (box.l, box.t, box.r, box.b) == (0.0, 0.0, 100.0, 80.0)
+    assert box.coord_origin == CoordOrigin.TOPLEFT
+
+
+def test_get_bitmap_rects_scaled():
+    """The full-page bitmap rect grows with the requested scale."""
+    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
+    page: _ImagePageBackend = backend.load_page(0)
+
+    boxes = list(page.get_bitmap_rects(scale=2.0))
+    assert len(boxes) == 1
+    box = boxes[0]
+    assert (box.l, box.t, box.r, box.b) == (0.0, 0.0, 200.0, 160.0)
+    assert box.coord_origin == CoordOrigin.TOPLEFT
+
+
+def test_get_text_in_rect():
+    """Raw images carry no text layer, so any rect yields an empty string."""
+    backend = _get_backend_from_stream(_make_png_stream())
+    page: _ImagePageBackend = backend.load_page(0)
+
+    probe = BoundingBox(l=10, t=10, r=50, b=50, coord_origin=CoordOrigin.TOPLEFT)
+    assert page.get_text_in_rect(probe) == ""
+
+
+def test_multipage_access():
+    """Every frame of a multi-page TIFF loads as a valid page of the right size."""
+    expected_pages = 4
+    backend = _get_backend_from_stream(
+        _make_multipage_tiff_stream(num_pages=expected_pages, size=(64, 64))
+    )
+    assert backend.page_count() == expected_pages
+
+    for page_no in range(expected_pages):
+        page = backend.load_page(page_no)
+        assert page.is_valid()
+        page_size = page.get_size()
+        assert (page_size.width, page_size.height) == (64, 64)
diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py index 7003e957..eb07af9d 100644 --- a/tests/test_backend_mets_gbs.py +++ b/tests/test_backend_mets_gbs.py @@ -15,7 +15,7 @@ def test_doc_path(): def _get_backend(pdf_doc): in_doc = InputDocument( path_or_stream=pdf_doc, - format=InputFormat.PDF, + format=InputFormat.METS_GBS, backend=MetsGbsDocumentBackend, ) diff --git a/tests/test_backend_webp.py b/tests/test_backend_webp.py index ad97dc44..35958019 100644 --- a/tests/test_backend_webp.py +++ b/tests/test_backend_webp.py @@ -2,6 +2,8 @@ import sys from pathlib import Path from typing import List +from pydantic.type_adapter import R + from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument from docling.datamodel.pipeline_options import ( @@ -72,7 +74,9 @@ def test_e2e_webp_conversions(): for webp_path in webp_paths: print(f"converting {webp_path}") - doc_result: ConversionResult = converter.convert(webp_path) + doc_result: ConversionResult = converter.convert( + webp_path, raises_on_error=True + ) verify_conversion_result_v2( input_path=webp_path, diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 248de7d1..a0e6e85b 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -4,9 +4,6 @@ from pathlib import Path import pytest from pydantic import 
ValidationError -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend -from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.backend_options import ( @@ -17,7 +14,7 @@ from docling.datamodel.backend_options import ( from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.document import InputDocument, _DocumentConversionInput from docling.datamodel.settings import DocumentLimits -from docling.document_converter import PdfFormatOption +from docling.document_converter import ImageFormatOption, PdfFormatOption def test_in_doc_from_valid_path(): @@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf(): assert doc.valid is False -def test_image_in_pdf_backend(): - in_doc = InputDocument( - path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), - format=InputFormat.IMAGE, - backend=PyPdfiumDocumentBackend, - ) - - assert in_doc.valid - in_doc = InputDocument( - path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), - format=InputFormat.IMAGE, - backend=DoclingParseDocumentBackend, - ) - assert in_doc.valid - - in_doc = InputDocument( - path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), - format=InputFormat.IMAGE, - backend=DoclingParseV2DocumentBackend, - ) - assert in_doc.valid - - in_doc = InputDocument( - path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), - format=InputFormat.IMAGE, - backend=DoclingParseV4DocumentBackend, - ) - assert in_doc.valid - - def test_in_doc_with_page_range(): test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") limits = DocumentLimits() @@ -297,7 +264,7 @@ def test_tiff_two_pages(): doc = InputDocument( path_or_stream=tiff_path, format=InputFormat.IMAGE, - backend=PdfFormatOption().backend, 
# use default backend + backend=ImageFormatOption().backend, # use default backend ) assert doc.valid is True assert doc.page_count == 2