From c1d722725f23bf59316fcf394a5b9434bbdc1e61 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 10 Jul 2025 15:04:28 +0000 Subject: [PATCH] Fix multi-page TIFF image support Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com> --- docling/backend/pdf_backend.py | 23 +++++- tests/test_multipage_tiff.py | 139 +++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 tests/test_multipage_tiff.py diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index 3d07578b..cd4a07a3 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -57,7 +57,28 @@ class PdfDocumentBackend(PaginatedDocumentBackend): if self.input_format is InputFormat.IMAGE: buf = BytesIO() img = Image.open(self.path_or_stream) - img.save(buf, "PDF") + + # Handle multi-page TIFF images + if hasattr(img, 'n_frames') and img.n_frames > 1: + # Extract all frames from multi-page image + frames = [] + try: + for i in range(img.n_frames): + img.seek(i) + frames.append(img.copy()) + except EOFError: + pass + + # Save as multi-page PDF + if frames: + frames[0].save(buf, "PDF", save_all=True, append_images=frames[1:]) + else: + # Fallback to single page if frame extraction fails + img.save(buf, "PDF") + else: + # Single page image - use existing behavior + img.save(buf, "PDF") + buf.seek(0) self.path_or_stream = buf else: diff --git a/tests/test_multipage_tiff.py b/tests/test_multipage_tiff.py new file mode 100644 index 00000000..309c1aad --- /dev/null +++ b/tests/test_multipage_tiff.py @@ -0,0 +1,139 @@ +import os +import tempfile +from pathlib import Path +from io import BytesIO + +import pypdfium2 as pdfium +from PIL import Image, ImageDraw, ImageFont + +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument + + +def create_test_multipage_image(format_name, num_pages=3): + """Create a test multi-page image file.""" + images = [] + + for i in range(num_pages): + # Create a unique image for each page + img = Image.new('RGB', (200, 150), color='white') + draw = ImageDraw.Draw(img) + + # Draw page identifier + text = f"Page {i+1}" + draw.text((10, 10), text, fill='black') + + # Draw some unique content for each page + colors = ['red', 'green', 'blue', 'orange', 'purple'] + color = colors[i % len(colors)] + draw.rectangle([10, 30, 190, 140], outline=color, width=2) + + images.append(img) + + # Save as multi-page image + suffix = '.tif' if format_name == 'TIFF' else f'.{format_name.lower()}' + temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False) + temp_file.close() + + try: + if images: + images[0].save(temp_file.name, format_name, save_all=True, append_images=images[1:]) + return temp_file.name + except Exception as e: + # Clean up on error + if os.path.exists(temp_file.name): + os.unlink(temp_file.name) + raise e + + +def test_multipage_image_support(): + """Test that multi-page image files are properly handled.""" + + # Test with different image formats and numbers of pages + test_cases = [ + ('TIFF', 1, "single page TIFF"), + ('TIFF', 2, "two-page TIFF"), + ('TIFF', 3, "three-page TIFF"), + ('GIF', 2, "two-page GIF"), + ('WEBP', 2, "two-page WEBP"), + ] + + for format_name, num_pages, description in test_cases: + print(f"Testing {description}...") + + # Create test file + test_file = create_test_multipage_image(format_name, num_pages) + + try: + # Verify the image has the expected number of pages + with Image.open(test_file) as img: + expected_pages = getattr(img, 'n_frames', 1) + assert expected_pages == num_pages, f"Test {format_name} should have {num_pages} pages, got {expected_pages}" + + # Test with docling backend + input_doc = InputDocument( + path_or_stream=Path(test_file), + format=InputFormat.IMAGE, + backend=DoclingParseV4DocumentBackend + ) + + backend = DoclingParseV4DocumentBackend(input_doc, Path(test_file)) + + # Check the page count in the backend + actual_pages = backend.page_count() + assert actual_pages == expected_pages, f"Backend should report {expected_pages} pages, got {actual_pages}" + + # Also verify the PDF was created correctly + if isinstance(backend.path_or_stream, BytesIO): + backend.path_or_stream.seek(0) + pdf_doc = pdfium.PdfDocument(backend.path_or_stream) + pdf_pages = len(pdf_doc) + pdf_doc.close() + assert pdf_pages == expected_pages, f"PDF should have {expected_pages} pages, got {pdf_pages}" + + print(f"✅ {description} passed") + + finally: + # Clean up + if os.path.exists(test_file): + os.unlink(test_file) + + +def test_single_page_image_unchanged(): + """Test that single-page image files still work as before.""" + + # Test different single-page formats + formats_to_test = ['TIFF', 'GIF', 'WEBP'] + + for format_name in formats_to_test: + print(f"Testing single-page {format_name}...") + + # Create a single-page image + test_file = create_test_multipage_image(format_name, 1) + + try: + # Test with docling backend + input_doc = InputDocument( + path_or_stream=Path(test_file), + format=InputFormat.IMAGE, + backend=DoclingParseV4DocumentBackend + ) + + backend = DoclingParseV4DocumentBackend(input_doc, Path(test_file)) + + # Should have exactly 1 page + assert backend.page_count() == 1, f"Single-page {format_name} should have 1 page, got {backend.page_count()}" + + print(f"✅ Single-page {format_name} test passed") + + finally: + # Clean up + if os.path.exists(test_file): + os.unlink(test_file) + + +if __name__ == "__main__": + test_multipage_image_support() + test_single_page_image_unchanged() + print("All tests passed!") \ No newline at end of file