diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py
index 3d07578b..1b0d612e 100644
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -57,7 +57,33 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
             if self.input_format is InputFormat.IMAGE:
                 buf = BytesIO()
-                img = Image.open(self.path_or_stream)
-                img.save(buf, "PDF")
+                # `with` releases the file handle Pillow opens for a path;
+                # a caller-supplied stream is left open.
+                with Image.open(self.path_or_stream) as img:
+                    # Multi-frame images (e.g. multi-page TIFF) become one
+                    # PDF page per frame.
+                    if getattr(img, "n_frames", 1) > 1:
+                        frames = []
+                        try:
+                            for i in range(img.n_frames):
+                                img.seek(i)
+                                # convert() returns a new image, so no
+                                # explicit copy() is needed.
+                                frames.append(img.convert("RGB"))
+                        except EOFError:
+                            # Truncated file: keep the frames read so far.
+                            pass
+
+                        if frames:
+                            frames[0].save(
+                                buf, "PDF", save_all=True, append_images=frames[1:]
+                            )
+                        else:
+                            # No frame could be read; try the image as-is.
+                            img.convert("RGB").save(buf, "PDF")
+                    else:
+                        # Single-frame image: convert to RGB and save.
+                        img.convert("RGB").save(buf, "PDF")
+
                 buf.seek(0)
                 self.path_or_stream = buf
             else:
diff --git a/tests/data/tiff/2206.01062.tif b/tests/data/tiff/2206.01062.tif
new file mode 100644
index 00000000..bade736b
Binary files /dev/null and b/tests/data/tiff/2206.01062.tif differ
diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py
index d5e40f0f..29f1dafe 100644
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -243,3 +243,26 @@ def _make_input_doc_from_stream(doc_stream):
         backend=PdfFormatOption().backend,  # use default
     )
     return in_doc
+
+
+def test_tiff_two_pages():
+    tiff_path = Path("./tests/data/tiff/2206.01062.tif")
+    doc = InputDocument(
+        path_or_stream=tiff_path,
+        format=InputFormat.IMAGE,
+        backend=PdfFormatOption().backend,  # use default backend
+    )
+    assert doc.valid is True
+    assert doc.page_count == 2
+
+    # Expect two full-page rectangles
+    rects_page1 = doc._backend.load_page(0).get_bitmap_rects()
+    rects_page2 = doc._backend.load_page(1).get_bitmap_rects()
+
+    page1_rect = next(rects_page1)
+    page2_rect = next(rects_page2)
+
+    assert page1_rect.t == page2_rect.t == 0
+    assert page1_rect.l == page2_rect.l == 0
+    assert page1_rect.r == page2_rect.r == 612.0
+    assert page1_rect.b == page2_rect.b == 792.0