Fix multi-page TIFF image support

Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2025-07-10 15:04:28 +00:00
parent 420df478f3
commit c1d722725f
2 changed files with 161 additions and 1 deletions

View File

@ -57,7 +57,28 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
if self.input_format is InputFormat.IMAGE:
buf = BytesIO()
img = Image.open(self.path_or_stream)
img.save(buf, "PDF")
# Handle multi-page TIFF images
if hasattr(img, 'n_frames') and img.n_frames > 1:
# Extract all frames from multi-page image
frames = []
try:
for i in range(img.n_frames):
img.seek(i)
frames.append(img.copy())
except EOFError:
pass
# Save as multi-page PDF
if frames:
frames[0].save(buf, "PDF", save_all=True, append_images=frames[1:])
else:
# Fallback to single page if frame extraction fails
img.save(buf, "PDF")
else:
# Single page image - use existing behavior
img.save(buf, "PDF")
buf.seek(0)
self.path_or_stream = buf
else:

View File

@ -0,0 +1,139 @@
import os
import tempfile
from pathlib import Path
from io import BytesIO
import pypdfium2 as pdfium
from PIL import Image, ImageDraw, ImageFont
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
def create_test_multipage_image(format_name, num_pages=3):
    """Create a temporary multi-page image file and return its path.

    Every page is a 200x150 white RGB canvas carrying a "Page N" label and
    a rectangle outline whose colour cycles through a fixed palette.

    Args:
        format_name: Pillow format name handed to ``Image.save`` (for
            example ``'TIFF'``, ``'GIF'`` or ``'WEBP'``). ``'TIFF'`` is
            written with a ``.tif`` suffix; any other format uses its
            lower-cased name as the suffix.
        num_pages: Number of frames to write. Must be at least 1.

    Returns:
        The path of the created file as a ``str``. The file is created
        with ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If ``num_pages`` is smaller than 1.
    """
    if num_pages < 1:
        # Without this guard an empty (invalid) image file would be left on
        # disk and its path returned as if it were a usable fixture.
        raise ValueError(f"num_pages must be at least 1, got {num_pages}")

    # Loop-invariant palette; the outline colour cycles when there are more
    # pages than colours.
    palette = ('red', 'green', 'blue', 'orange', 'purple')

    pages = []
    for index in range(num_pages):
        page = Image.new('RGB', (200, 150), color='white')
        draw = ImageDraw.Draw(page)
        # Page identifier
        draw.text((10, 10), f"Page {index + 1}", fill='black')
        # Per-page outline colour
        outline = palette[index % len(palette)]
        draw.rectangle([10, 30, 190, 140], outline=outline, width=2)
        pages.append(page)

    suffix = '.tif' if format_name == 'TIFF' else f'.{format_name.lower()}'
    # Only the unique name is needed: the handle is closed right away and
    # Pillow writes to the path itself.
    temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    temp_file.close()

    try:
        pages[0].save(
            temp_file.name,
            format_name,
            save_all=True,
            append_images=pages[1:],
        )
    except Exception:
        # Do not leave a half-written fixture behind; re-raise unchanged.
        if os.path.exists(temp_file.name):
            os.unlink(temp_file.name)
        raise
    return temp_file.name
def test_multipage_image_support():
    """Check that the backend reports one page per frame of an image.

    For each (format, frame count) case a temporary image is generated,
    its frame count is confirmed with Pillow, and the page count reported
    by ``DoclingParseV4DocumentBackend`` is compared against it. When the
    backend exposes a ``BytesIO`` as ``path_or_stream`` (presumably the
    intermediate PDF built from the image -- confirm against the backend),
    that stream is also opened with pypdfium2 and its page count checked.

    The temporary image is removed after every case, pass or fail.
    """
    # (Pillow format name, number of pages, human-readable description)
    test_cases = [
        ('TIFF', 1, "single page TIFF"),
        ('TIFF', 2, "two-page TIFF"),
        ('TIFF', 3, "three-page TIFF"),
        ('GIF', 2, "two-page GIF"),
        ('WEBP', 2, "two-page WEBP"),
    ]
    for format_name, num_pages, description in test_cases:
        print(f"Testing {description}...")
        # Create test file
        test_file = create_test_multipage_image(format_name, num_pages)
        try:
            # Sanity-check the fixture itself before involving the backend.
            # Images without an ``n_frames`` attribute count as one page.
            with Image.open(test_file) as img:
                expected_pages = getattr(img, 'n_frames', 1)
            assert expected_pages == num_pages, f"Test {format_name} should have {num_pages} pages, got {expected_pages}"

            # Test with docling backend
            input_doc = InputDocument(
                path_or_stream=Path(test_file),
                format=InputFormat.IMAGE,
                backend=DoclingParseV4DocumentBackend,
            )
            backend = DoclingParseV4DocumentBackend(input_doc, Path(test_file))

            # Check the page count in the backend
            actual_pages = backend.page_count()
            assert actual_pages == expected_pages, f"Backend should report {expected_pages} pages, got {actual_pages}"

            # Also verify the PDF was created correctly
            if isinstance(backend.path_or_stream, BytesIO):
                backend.path_or_stream.seek(0)
                pdf_doc = pdfium.PdfDocument(backend.path_or_stream)
                try:
                    pdf_pages = len(pdf_doc)
                finally:
                    # Release the pdfium handle even if counting pages fails.
                    pdf_doc.close()
                assert pdf_pages == expected_pages, f"PDF should have {expected_pages} pages, got {pdf_pages}"

            print(f"{description} passed")
        finally:
            # Clean up
            if os.path.exists(test_file):
                os.unlink(test_file)
def test_single_page_image_unchanged():
    """Verify that one-frame images are still reported as a single page.

    A one-page fixture is generated for each format, loaded through
    ``DoclingParseV4DocumentBackend`` and its page count asserted to be 1.
    The fixture file is deleted afterwards regardless of the outcome.
    """
    for fmt in ('TIFF', 'GIF', 'WEBP'):
        print(f"Testing single-page {fmt}...")

        # One-frame fixture on disk
        image_path = create_test_multipage_image(fmt, 1)
        try:
            source = Path(image_path)
            document = InputDocument(
                path_or_stream=source,
                format=InputFormat.IMAGE,
                backend=DoclingParseV4DocumentBackend,
            )
            backend = DoclingParseV4DocumentBackend(document, Path(image_path))

            # Exactly one page is expected
            page_total = backend.page_count()
            assert page_total == 1, f"Single-page {fmt} should have 1 page, got {page_total}"

            print(f"✅ Single-page {fmt} test passed")
        finally:
            # Remove the fixture if it is still there
            if os.path.exists(image_path):
                os.unlink(image_path)
if __name__ == "__main__":
    # Direct execution: run each check in order, then report success.
    for check in (test_multipage_image_support, test_single_page_image_unchanged):
        check()
    print("All tests passed!")