Update examples and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2024-10-09 15:20:27 +02:00
parent 080042d06d
commit 0dfbd0b6fc
25 changed files with 181 additions and 150 deletions
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@@ -12,7 +12,7 @@ from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)

-USE_EXPERIMENTAL = False
+USE_EXPERIMENTAL = True


 def export_documents(
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@@ -7,7 +7,7 @@ from typing import Iterable
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
 from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.document_converter import DocumentConverter, FormatOption
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

 _log = logging.getLogger(__name__)
@@ -104,9 +104,7 @@ def main():

    doc_converter = DocumentConverter(
        format_options={
-            InputFormat.PDF: FormatOption(
-                pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
-            )
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -3,9 +3,15 @@ from pathlib import Path

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+    DocumentConverter,
+    FormatOption,
+    PdfFormatOption,
+    WordFormatOption,
+)
 from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
 from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

@@ -22,23 +28,25 @@ input_paths = [
 ]
 input = DocumentConversionInput.from_paths(input_paths)

-# for defaults use:
-doc_converter = DocumentConverter()
+## for defaults use:
+# doc_converter = DocumentConverter()

-# to customize use:
-# doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
-#     formats=[
-#         InputFormat.PDF,
-#         InputFormat.DOCX,
-#     ],  # whitelist formats, other files are ignored.
-#     format_options={
-#         InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
-#         InputFormat.DOCX: FormatOption(
-#             pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
-#         ),
-#         # InputFormat.IMAGE: PdfFormatOption(),
-#     },
-# )
+## to customize use:
+doc_converter = DocumentConverter(  # everything below is optional and has internal defaults.
+    formats=[
+        InputFormat.PDF,
+        InputFormat.DOCX,
+    ],  # whitelist of formats; files in other formats are ignored.
+    format_options={
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
+        ),  # PdfFormatOption(backend=PyPdfiumDocumentBackend),
+        InputFormat.DOCX: WordFormatOption(
+            pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
+        ),
+        # InputFormat.IMAGE: PdfFormatOption(),
+    },
+)

 conv_results = doc_converter.convert(input)