Merge branch 'main' of github.com:DS4SD/docling into nli/layout_heron

2025-07-25 03:24:59 +00:00 · 2025-07-23 15:03:31 +02:00 · 2025-07-23 15:03:31 +02:00 · 69e0123213
commit 69e0123213
parent fd0f06bba5 98e2fcff63
4 changed files with 55 additions and 2 deletions
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
            if self.input_format is InputFormat.IMAGE:
                buf = BytesIO()
                img = Image.open(self.path_or_stream)
-                img.save(buf, "PDF")
+
+                # Handle multi-frame images (e.g. multi-page TIFF)
+                if hasattr(img, "n_frames") and img.n_frames > 1:
+                    # Extract all frames from multi-page image
+                    frames = []
+                    try:
+                        for i in range(img.n_frames):
+                            img.seek(i)
+                            frame = img.copy().convert("RGB")
+                            frames.append(frame)
+                    except EOFError:
+                        pass
+
+                    # Save as multi-page PDF
+                    if frames:
+                        frames[0].save(
+                            buf, "PDF", save_all=True, append_images=frames[1:]
+                        )
+                    else:
+                        # Fall back to a single page if no frames could be extracted
+                        img.convert("RGB").save(buf, "PDF")
+                else:
+                    # Single page image - convert to RGB and save
+                    img.convert("RGB").save(buf, "PDF")
+
                buf.seek(0)
                self.path_or_stream = buf
            else:
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
        return conv_res

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
-        status = ConversionStatus.SUCCESS
+        status = conv_res.status
+        if status in [
+            ConversionStatus.PENDING,
+            ConversionStatus.STARTED,
+        ]:  # preserves ConversionStatus.PARTIAL_SUCCESS
+            status = ConversionStatus.SUCCESS
+
        for page in conv_res.pages:
            if page._backend is None or not page._backend.is_valid():
                conv_res.errors.append(
--- a/tests/data/tiff/2206.01062.tif
+++ b/tests/data/tiff/2206.01062.tif
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@ -243,3 +243,26 @@ def _make_input_doc_from_stream(doc_stream):
        backend=PdfFormatOption().backend,  # use default
    )
    return in_doc
+
+
+def test_tiff_two_pages():
+    tiff_path = Path("./tests/data/tiff/2206.01062.tif")
+    doc = InputDocument(
+        path_or_stream=tiff_path,
+        format=InputFormat.IMAGE,
+        backend=PdfFormatOption().backend,  # use default backend
+    )
+    assert doc.valid is True
+    assert doc.page_count == 2
+
+    # Expect the first bitmap rect of each of the two pages to span the full page
+    rects_page1 = doc._backend.load_page(0).get_bitmap_rects()
+    rects_page2 = doc._backend.load_page(1).get_bitmap_rects()
+
+    page1_rect = next(rects_page1)
+    page2_rect = next(rects_page2)
+
+    assert page1_rect.t == page2_rect.t == 0
+    assert page1_rect.l == page2_rect.l == 0
+    assert page1_rect.r == page2_rect.r == 612.0
+    assert page1_rect.b == page2_rect.b == 792.0