feat: add the Image backend (#2627)

* feat: add the Image backend Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the pre-commit Signed-off-by: Peter Staar <taa@zurich.ibm.com> * Fixed single- versus multi-frame image formats Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code. Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Adapt tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: correct mets_gbs backend test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Make ImagePageBackend.get_bitmap_rects() yield Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-11 22:28:31 +00:00 · 2025-11-17 11:37:22 +01:00
parent ae30373ee7
commit 3495b73de8
12 changed files with 494 additions and 82 deletions
--- a/docling/backend/image_backend.py
+++ b/docling/backend/image_backend.py
@@ -0,0 +1,188 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, List, Optional, Union
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
+from PIL import Image
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.backend_options import PdfBackendOptions
+from docling.datamodel.base_models import InputFormat, Size
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class _ImagePageBackend(PdfPageBackend):
+    """Page backend for one raw image: no text layer, one full-page bitmap."""
+
+    def __init__(self, image: Image.Image):
+        """Store ``image``; the page counts as valid when it is not None."""
+        self._image: Optional[Image.Image] = image
+        self.valid: bool = self._image is not None
+
+    def is_valid(self) -> bool:
+        """Return the validity flag computed in the constructor."""
+        return self.valid
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Return an empty string: raw images yield no text without OCR."""
+        return ""
+
+    def _full_page_box(self, origin: CoordOrigin) -> BoundingBox:
+        """Build the box l=0, t=0, r=width, b=height tagged with ``origin``."""
+        dims = self.get_size()
+        return BoundingBox(
+            l=0.0,
+            t=0.0,
+            r=float(dims.width),
+            b=float(dims.height),
+            coord_origin=origin,
+        )
+
+    def get_segmented_page(self) -> SegmentedPdfPage:
+        """Return an empty segmented page carrying only the image geometry."""
+        assert self._image is not None
+        page_box = self._full_page_box(CoordOrigin.BOTTOMLEFT)
+        # Every PDF-style boundary box is set to the same full-image box
+        geometry = PdfPageGeometry(
+            angle=0.0,
+            rect=BoundingRectangle.from_bounding_box(page_box),
+            boundary_type=PdfPageBoundaryType.CROP_BOX,
+            art_bbox=page_box,
+            bleed_bbox=page_box,
+            crop_bbox=page_box,
+            media_bbox=page_box,
+            trim_bbox=page_box,
+        )
+        return SegmentedPdfPage(
+            dimension=geometry,
+            char_cells=[],
+            word_cells=[],
+            textline_cells=[],
+            has_chars=False,
+            has_words=False,
+            has_lines=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        """Return an empty list: raw images have no text cells."""
+        return []
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        """Yield one TOPLEFT box covering the whole page, scaled by ``scale``."""
+        assert self._image is not None
+        whole_page = self._full_page_box(CoordOrigin.TOPLEFT)
+        yield whole_page if scale == 1 else whole_page.scaled(scale=scale)
+
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        """Return the page image, cropped to ``cropbox`` then resized by ``scale``."""
+        assert self._image is not None
+        picture = self._image
+
+        if cropbox is not None:
+            # Crop boxes are expected in TOPLEFT coords; convert any other origin
+            if cropbox.coord_origin != CoordOrigin.TOPLEFT:
+                cropbox = cropbox.to_top_left_origin(picture.height)
+            # Round to whole pixels, then clamp to the image bounds
+            x0, y0, x1, y1 = map(round, cropbox.as_tuple())
+            x0, y0 = max(0, x0), max(0, y0)
+            x1, y1 = min(picture.width, x1), min(picture.height, y1)
+            picture = picture.crop((x0, y0, x1, y1))
+
+        if scale != 1:
+            # Keep each side at least one pixel long
+            new_size = tuple(max(1, round(side * scale)) for side in picture.size)
+            picture = picture.resize(new_size)
+
+        return picture
+
+    def get_size(self) -> Size:
+        """Return the image width and height as a ``Size``."""
+        assert self._image is not None
+        return Size(width=self._image.width, height=self._image.height)
+
+    def unload(self):
+        """Drop the image reference to help GC and free memory."""
+        self._image = None
+
+
+class ImageDocumentBackend(PdfDocumentBackend):
+    """Image-native backend that bypasses pypdfium2.
+
+    Notes:
+        - Subclasses PdfDocumentBackend to satisfy pipeline type checks.
+        - Skips PdfDocumentBackend.__init__: no image→PDF conversion or pypdfium2.
+        - Converts each frame eagerly into its own RGB Image for thread-safety.
+    """
+
+    def __init__(
+        self,
+        in_doc: InputDocument,
+        path_or_stream: Union[BytesIO, Path],
+        options: PdfBackendOptions = PdfBackendOptions(),
+    ):
+        """Load every frame; RuntimeError on a non-IMAGE input or a failed load."""
+        # Bypass PdfDocumentBackend.__init__ to avoid image→PDF conversion
+        AbstractDocumentBackend.__init__(self, in_doc, path_or_stream, options)
+        self.options: PdfBackendOptions = options
+
+        if self.input_format not in {InputFormat.IMAGE}:
+            raise RuntimeError(
+                f"Incompatible file format {self.input_format} was passed to ImageDocumentBackend."
+            )
+
+        self._frames: List[Image.Image] = []
+        try:
+            # Context manager closes any file Pillow opened once frames are copied out
+            with Image.open(self.path_or_stream) as img:  # type: ignore[arg-type]
+                # multiframe: TIFF, GIF, ICO; singleframe: JPEG, PNG, BMP, WEBP (unless animated), HEIC
+                frame_count = getattr(img, "n_frames", 1)
+                if frame_count > 1:
+                    for i in range(frame_count):
+                        img.seek(i)
+                        self._frames.append(img.copy().convert("RGB"))
+                else:
+                    self._frames.append(img.convert("RGB"))
+        except Exception as e:
+            raise RuntimeError(f"Could not load image for document {self.file}") from e
+
+    def is_valid(self) -> bool:
+        """Return True when at least one frame is currently held."""
+        return len(self._frames) > 0
+
+    def page_count(self) -> int:
+        """Return the number of frames, each of which is exposed as one page."""
+        return len(self._frames)
+
+    def load_page(self, page_no: int) -> _ImagePageBackend:
+        """Return the page backend for 0-based ``page_no`` or raise IndexError."""
+        if not (0 <= page_no < len(self._frames)):
+            raise IndexError(f"Page index out of range: {page_no}")
+        return _ImagePageBackend(self._frames[page_no])
+
+    @classmethod
+    def supported_formats(cls) -> set[InputFormat]:
+        """Only IMAGE here; PDF handling remains in PDF-oriented backends."""
+        return {InputFormat.IMAGE}
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Return True: each frame is served as a separate page."""
+        return True
+
+    def unload(self):
+        """Call the parent unload, then release the frame list."""
+        super().unload()
+        self._frames = []
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -60,41 +60,10 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
        super().__init__(in_doc, path_or_stream, options)
        self.options: PdfBackendOptions

-        if self.input_format is not InputFormat.PDF:
-            if self.input_format is InputFormat.IMAGE:
-                buf = BytesIO()
-                img = Image.open(self.path_or_stream)
-
-                # Handle multi-page TIFF images
-                if hasattr(img, "n_frames") and img.n_frames > 1:
-                    # Extract all frames from multi-page image
-                    frames = []
-                    try:
-                        for i in range(img.n_frames):
-                            img.seek(i)
-                            frame = img.copy().convert("RGB")
-                            frames.append(frame)
-                    except EOFError:
-                        pass
-
-                    # Save as multi-page PDF
-                    if frames:
-                        frames[0].save(
-                            buf, "PDF", save_all=True, append_images=frames[1:]
-                        )
-                    else:
-                        # Fallback to single page if frame extraction fails
-                        img.convert("RGB").save(buf, "PDF")
-                else:
-                    # Single page image - convert to RGB and save
-                    img.convert("RGB").save(buf, "PDF")
-
-                buf.seek(0)
-                self.path_or_stream = buf
-            elif self.input_format not in self.supported_formats():
-                raise RuntimeError(
-                    f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
-                )
+        if self.input_format not in self.supported_formats():
+            raise RuntimeError(
+                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
+            )

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
@@ -106,7 +75,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.PDF, InputFormat.IMAGE}
+        return {InputFormat.PDF}

    @classmethod
    def supports_pagination(cls) -> bool:
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.image_backend import ImageDocumentBackend
 from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -698,9 +699,16 @@ def convert(  # noqa: C901
            if artifacts_path is not None:
                simple_format_option.artifacts_path = artifacts_path

+            # Use image-native backend for IMAGE to avoid pypdfium2 locking
+            image_format_option = PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=ImageDocumentBackend,
+                backend_options=pdf_backend_options,
+            )
+
            format_options = {
                InputFormat.PDF: pdf_format_option,
-                InputFormat.IMAGE: pdf_format_option,
+                InputFormat.IMAGE: image_format_option,
                InputFormat.METS_GBS: mets_gbs_format_option,
                InputFormat.DOCX: WordFormatOption(
                    pipeline_options=simple_format_option
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -164,6 +164,7 @@ class DoclingComponentType(str, Enum):
    MODEL = "model"
    DOC_ASSEMBLER = "doc_assembler"
    USER_INPUT = "user_input"
+    PIPELINE = "pipeline"


 class VlmStopReason(str, Enum):
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -3,6 +3,7 @@ import logging
 import sys
 import threading
 import time
+import warnings
 from collections.abc import Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
@@ -21,6 +22,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.image_backend import ImageDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
@@ -129,7 +131,7 @@ class XMLJatsFormatOption(FormatOption):

 class ImageFormatOption(FormatOption):
    pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
+    backend: Type[AbstractDocumentBackend] = ImageDocumentBackend


 class PdfFormatOption(FormatOption):
@@ -184,10 +186,35 @@ class DocumentConverter:
        self.allowed_formats = (
            allowed_formats if allowed_formats is not None else list(InputFormat)
        )
+
+        # Normalize format options: ensure IMAGE format uses ImageDocumentBackend
+        # for backward compatibility (old code might use PdfFormatOption or other backends for images)
+        normalized_format_options: dict[InputFormat, FormatOption] = {}
+        if format_options:
+            for format, option in format_options.items():
+                if (
+                    format == InputFormat.IMAGE
+                    and option.backend is not ImageDocumentBackend
+                ):
+                    warnings.warn(
+                        f"Using {option.backend.__name__} for InputFormat.IMAGE is deprecated. "
+                        "Images should use ImageDocumentBackend via ImageFormatOption. "
+                        "Automatically correcting the backend, please update your code to avoid this warning.",
+                        DeprecationWarning,
+                        stacklevel=2,
+                    )
+                    # Convert to ImageFormatOption while preserving pipeline and backend options
+                    normalized_format_options[format] = ImageFormatOption(
+                        pipeline_options=option.pipeline_options,
+                        backend_options=option.backend_options,
+                    )
+                else:
+                    normalized_format_options[format] = option
+
        self.format_to_options: dict[InputFormat, FormatOption] = {
            format: (
                _get_default_option(format=format)
-                if (custom_option := (format_options or {}).get(format)) is None
+                if (custom_option := normalized_format_options.get(format)) is None
                else custom_option
            )
            for format in self.allowed_formats
@@ -263,8 +290,12 @@ class DocumentConverter:
                ConversionStatus.SUCCESS,
                ConversionStatus.PARTIAL_SUCCESS,
            }:
+                error_details = ""
+                if conv_res.errors:
+                    error_messages = [err.error_message for err in conv_res.errors]
+                    error_details = f" Errors: {'; '.join(error_messages)}"
                raise ConversionError(
-                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}.{error_details}"
                )
            else:
                yield conv_res
--- a/docling/document_extractor.py
+++ b/docling/document_extractor.py
@@ -14,6 +14,7 @@ from pydantic import ConfigDict, model_validator, validate_call
 from typing_extensions import Self

 from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.image_backend import ImageDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
    BaseFormatOption,
@@ -72,7 +73,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
    the format registry between convert/extract.
    """
    format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
-        InputFormat.IMAGE: PyPdfiumDocumentBackend,
+        InputFormat.IMAGE: ImageDocumentBackend,
        InputFormat.PDF: PyPdfiumDocumentBackend,
    }

--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -76,8 +76,15 @@ class BasePipeline(ABC):
                conv_res.status = self._determine_status(conv_res)
        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE
-            if raises_on_error:
-                raise e
+            if not raises_on_error:
+                error_item = ErrorItem(
+                    component_type=DoclingComponentType.PIPELINE,
+                    module_name=self.__class__.__name__,
+                    error_message=str(e),
+                )
+                conv_res.errors.append(error_item)
+            else:
+                raise RuntimeError(f"Pipeline {self.__class__.__name__} failed") from e
        finally:
            self._unload(conv_res)

--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -30,7 +30,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
+from docling.datamodel.base_models import (
+    AssembledUnit,
+    ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
+    Page,
+)
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
 from docling.datamodel.settings import settings
@@ -265,7 +271,9 @@ class ThreadedPipelineStage:
                        )
                    )
            except Exception as exc:
-                _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
+                _log.error(
+                    "Stage %s failed for run %d: %s", self.name, rid, exc, exc_info=True
+                )
                for it in items:
                    it.is_failed = True
                    it.error = exc
@@ -598,6 +606,16 @@ class StandardPdfPipeline(ConvertPipeline):
            if p.page_no in page_map
            or not any(fp == p.page_no for fp, _ in proc.failed_pages)
        ]
+        # Add error details from failed pages
+        for page_no, error in proc.failed_pages:
+            page_label = f"Page {page_no + 1}" if page_no >= 0 else "Unknown page"
+            error_msg = str(error) if error else ""
+            error_item = ErrorItem(
+                component_type=DoclingComponentType.PIPELINE,
+                module_name=self.__class__.__name__,
+                error_message=f"{page_label}: {error_msg}" if error_msg else page_label,
+            )
+            conv_res.errors.append(error_item)
        if proc.is_complete_failure:
            conv_res.status = ConversionStatus.FAILURE
        elif proc.is_partial_success: