Add image format support to PdfBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2024-10-11 16:47:15 +02:00
parent d0fccb9342
commit 6efcf0a5a5
12 changed files with 110 additions and 76 deletions
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -21,57 +21,34 @@ input_paths = [
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
+    Path("tests/data/2305.03393v1-pg9-img.png"),
    Path("tests/data/2206.01062.pdf"),
-    # Path("tests/data/2305.03393v1-pg9-img.png"),
 ]

 ## for defaults use:
 # doc_converter = DocumentConverter()

 ## to customize use:
-doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
-    formats=[
-        InputFormat.PDF,
-        # InputFormat.IMAGE,
-        InputFormat.DOCX,
-        InputFormat.HTML,
-        InputFormat.PPTX,
-    ],  # whitelist formats, other files are ignored.
-    format_options={
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
-        ),  # PdfFormatOption(backend=PyPdfiumDocumentBackend),
-        InputFormat.DOCX: WordFormatOption(
-            pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
-        ),
-        # InputFormat.IMAGE: PdfFormatOption(),
-    },
+doc_converter = (
+    DocumentConverter(  # all of the below is optional and has internal defaults.
+        formats=[
+            InputFormat.PDF,
+            InputFormat.IMAGE,
+            InputFormat.DOCX,
+            InputFormat.HTML,
+            InputFormat.PPTX,
+        ],  # whitelist of formats; non-matching files are ignored.
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
+            ),
+            InputFormat.DOCX: WordFormatOption(
+                pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
+            ),
+        },
+    )
 )

-doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
-    pdf=None,
-    docx=WordFormatOption(
-        pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
-    ),
-    formats=[
-        InputFormat.PDF,
-        # InputFormat.IMAGE,
-        InputFormat.DOCX,
-        InputFormat.HTML,
-        InputFormat.PPTX,
-    ],  # whitelist formats, other files are ignored.
-    format_options={
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
-        ),  # PdfFormatOption(backend=PyPdfiumDocumentBackend),
-        InputFormat.DOCX: WordFormatOption(
-            pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
-        ),
-        # InputFormat.IMAGE: PdfFormatOption(),
-    },
-)
-
-
 conv_results = doc_converter.convert_all(input_paths)

 for res in conv_results: