Merge from simplify-conv-api

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2024-10-11 15:57:08 +02:00
parent 95c1f80087 136f16e85a
commit d0fccb9342
22 changed files with 286 additions and 380 deletions
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@@ -7,7 +7,7 @@ from typing import Iterable
 import yaml

 from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)
@@ -125,18 +125,19 @@ def main():

    doc_converter = DocumentConverter()

-    input = DocumentConversionInput.from_paths(input_doc_paths)
-
    start_time = time.time()

-    conv_results = doc_converter.convert_batch(input)
+    conv_results = doc_converter.convert_all(
+        input_doc_paths,
+        raises_on_error=False,  # do not abort on the first failure; convert every document and examine the results at the end
+    )
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+    _log.info(f"Document conversion complete in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@@ -5,9 +5,14 @@ from pathlib import Path
 from typing import Iterable

 from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

 _log = logging.getLogger(__name__)

@@ -60,9 +65,7 @@ def export_documents(
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")

    ###########################################################################

@@ -147,24 +150,13 @@ def main():

    ###########################################################################

-    # Define input files
-    input = DocumentConversionInput.from_paths(input_doc_paths)
-
    start_time = time.time()

-    conv_results = doc_converter.convert_batch(input)
-    success_count, failure_count = export_documents(
-        conv_results, output_dir=Path("./scratch")
-    )
+    conv_result = doc_converter.convert(input_doc_path)

    end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted in {end_time:.2f} seconds.")


 if __name__ == "__main__":
--- a/examples/export_figures.py
+++ b/examples/export_figures.py
@@ -2,13 +2,7 @@ import logging
 import time
 from pathlib import Path

-from docling.datamodel.base_models import (
-    ConversionStatus,
-    FigureElement,
-    InputFormat,
-    Table,
-)
-from docling.datamodel.document import DocumentConversionInput
+from docling.datamodel.base_models import FigureElement, InputFormat, Table
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -20,13 +14,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
@@ -42,46 +32,29 @@ def main():

    start_time = time.time()

-    conv_results = doc_converter.convert_batch(input_files)
+    conv_res = doc_converter.convert(input_doc_path)

-    success_count = 0
-    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
-    for conv_res in conv_results:
-        if conv_res.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {conv_res.input.file} failed to convert.")
-            failure_count += 1
-            continue
+    doc_filename = conv_res.input.file.stem

-        doc_filename = conv_res.input.file.stem
+    # Export page images
+    for page in conv_res.pages:
+        page_no = page.page_no + 1
+        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
+        with page_image_filename.open("wb") as fp:
+            page.image.save(fp, format="PNG")

-        # Export page images
-        for page in conv_res.pages:
-            page_no = page.page_no + 1
-            page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
-            with page_image_filename.open("wb") as fp:
-                page.image.save(fp, format="PNG")
-
-        # Export figures and tables
-        for element, image in conv_res.render_element_images(
-            element_types=(FigureElement, Table)
-        ):
-            element_image_filename = (
-                output_dir / f"{doc_filename}-element-{element.id}.png"
-            )
-            with element_image_filename.open("wb") as fp:
-                image.save(fp, "PNG")
-
-        success_count += 1
+    # Export figures and tables
+    for element, image in conv_res.render_element_images(
+        element_types=(FigureElement, Table)
+    ):
+        element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png"
+        with element_image_filename.open("wb") as fp:
+            image.save(fp, "PNG")

    end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")


 if __name__ == "__main__":
--- a/examples/export_multimodal.py
+++ b/examples/export_multimodal.py
@@ -5,8 +5,7 @@ from pathlib import Path

 import pandas as pd

-from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import DocumentConversionInput
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.utils.export import generate_multimodal_pages
@@ -19,13 +18,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
@@ -41,53 +36,45 @@ def main():

    start_time = time.time()

-    converted_docs = doc_converter.convert_batch(input_files)
+    conv_res = doc_converter.convert(input_doc_path)

-    success_count = 0
-    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
-    for doc in converted_docs:
-        if doc.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {doc.input.file} failed to convert.")
-            failure_count += 1
-            continue

-        rows = []
-        for (
-            content_text,
-            content_md,
-            content_dt,
-            page_cells,
-            page_segments,
-            page,
-        ) in generate_multimodal_pages(doc):
+    rows = []
+    for (
+        content_text,
+        content_md,
+        content_dt,
+        page_cells,
+        page_segments,
+        page,
+    ) in generate_multimodal_pages(conv_res):

-            dpi = page._default_image_scale * 72
+        dpi = page._default_image_scale * 72

-            rows.append(
-                {
-                    "document": doc.input.file.name,
-                    "hash": doc.input.document_hash,
-                    "page_hash": page.page_hash,
-                    "image": {
-                        "width": page.image.width,
-                        "height": page.image.height,
-                        "bytes": page.image.tobytes(),
-                    },
-                    "cells": page_cells,
-                    "contents": content_text,
-                    "contents_md": content_md,
-                    "contents_dt": content_dt,
-                    "segments": page_segments,
-                    "extra": {
-                        "page_num": page.page_no + 1,
-                        "width_in_points": page.size.width,
-                        "height_in_points": page.size.height,
-                        "dpi": dpi,
-                    },
-                }
-            )
-        success_count += 1
+        rows.append(
+            {
+                "document": conv_res.input.file.name,
+                "hash": conv_res.input.document_hash,
+                "page_hash": page.page_hash,
+                "image": {
+                    "width": page.image.width,
+                    "height": page.image.height,
+                    "bytes": page.image.tobytes(),
+                },
+                "cells": page_cells,
+                "contents": content_text,
+                "contents_md": content_md,
+                "contents_dt": content_dt,
+                "segments": page_segments,
+                "extra": {
+                    "page_num": page.page_no + 1,
+                    "width_in_points": page.size.width,
+                    "height_in_points": page.size.height,
+                    "dpi": dpi,
+                },
+            }
+        )

    # Generate one parquet from all documents
    df = pd.json_normalize(rows)
@@ -97,12 +84,9 @@ def main():

    end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(
+        f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
+    )

    # This block demonstrates how the file can be opened with the HF datasets library
    # from datasets import Dataset
--- a/examples/export_tables.py
+++ b/examples/export_tables.py
@@ -4,8 +4,6 @@ from pathlib import Path

 import pandas as pd

-from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)
@@ -14,59 +12,39 @@ _log = logging.getLogger(__name__)
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
    doc_converter = DocumentConverter()

    start_time = time.time()

-    conv_results = doc_converter.convert_batch(input_files)
+    conv_res = doc_converter.convert(input_doc_path)

-    success_count = 0
-    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
-    for conv_res in conv_results:
-        if conv_res.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {conv_res.input.file} failed to convert.")
-            failure_count += 1
-            continue

-        doc_filename = conv_res.input.file.stem
+    doc_filename = conv_res.input.file.stem

-        # Export tables
-        for table_ix, table in enumerate(conv_res.legacy_output.tables):
-            table_df: pd.DataFrame = table.export_to_dataframe()
-            print(f"## Table {table_ix}")
-            print(table_df.to_markdown())
+    # Export tables
+    for table_ix, table in enumerate(conv_res.legacy_output.tables):
+        table_df: pd.DataFrame = table.export_to_dataframe()
+        print(f"## Table {table_ix}")
+        print(table_df.to_markdown())

-            # Save the table as csv
-            element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
-            _log.info(f"Saving CSV table to {element_csv_filename}")
-            table_df.to_csv(element_csv_filename)
+        # Save the table as csv
+        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
+        _log.info(f"Saving CSV table to {element_csv_filename}")
+        table_df.to_csv(element_csv_filename)

-            # Save the table as html
-            element_html_filename = (
-                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
-            )
-            _log.info(f"Saving HTML table to {element_html_filename}")
-            with element_html_filename.open("w") as fp:
-                fp.write(table.export_to_html())
-
-        success_count += 1
+        # Save the table as html
+        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
+        _log.info(f"Saving HTML table to {element_html_filename}")
+        with element_html_filename.open("w") as fp:
+            fp.write(table.export_to_html())

    end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")


 if __name__ == "__main__":
--- a/examples/minimal.py
+++ b/examples/minimal.py
@@ -2,7 +2,7 @@ from docling.document_converter import DocumentConverter

 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
-result = converter.convert_single(source)
+result = converter.convert(source)
 print(result.output.export_to_markdown())  # output: ## Docling Technical Report [...]"
 # if the legacy output is needed, use this version
 # print(result.render_as_markdown_v1())  # output: ## Docling Technical Report [...]"
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -4,7 +4,6 @@ from pathlib import Path

 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
@@ -25,7 +24,6 @@ input_paths = [
    Path("tests/data/2206.01062.pdf"),
    # Path("tests/data/2305.03393v1-pg9-img.png"),
 ]
-input = DocumentConversionInput.from_paths(input_paths)

 ## for defaults use:
 # doc_converter = DocumentConverter()
@@ -50,12 +48,36 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
    },
 )

-conv_results = doc_converter.convert_batch(input)
+doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
+    pdf=None,
+    docx=WordFormatOption(
+        pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
+    ),
+    formats=[
+        InputFormat.PDF,
+        # InputFormat.IMAGE,
+        InputFormat.DOCX,
+        InputFormat.HTML,
+        InputFormat.PPTX,
+    ],  # whitelist formats, other files are ignored.
+    format_options={
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
+        ),  # PdfFormatOption(backend=PyPdfiumDocumentBackend),
+        InputFormat.DOCX: WordFormatOption(
+            pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
+        ),
+        # InputFormat.IMAGE: PdfFormatOption(),
+    },
+)
+
+
+conv_results = doc_converter.convert_all(input_paths)

 for res in conv_results:
    out_path = Path("./scratch")
    print(
-        f"Document {res.input.file.name} converted with status {res.status}."
+        f"Document {res.input.file.name} converted."
        f"\nSaved markdown output to: {str(out_path)}"
    )
    # print(res.experimental.export_to_markdown())