Merge remaining changes from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-15 10:52:16 +02:00
parent dac82ca7f2
commit fa5d972291
22 changed files with 1515 additions and 12 deletions

View File

@@ -1,139 +0,0 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

import yaml

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)

USE_V2 = True
USE_LEGACY = False


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            if USE_V2:
                # Export Docling document format to JSON:
                with (output_dir / f"{doc_filename}.json").open("w") as fp:
                    fp.write(json.dumps(conv_res.document.export_to_dict()))

                # Export Docling document format to YAML:
                with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
                    fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))

                # Export Docling document format to doctags:
                with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
                    fp.write(conv_res.document.export_to_document_tokens())

                # Export Docling document format to markdown:
                with (output_dir / f"{doc_filename}.md").open("w") as fp:
                    fp.write(conv_res.document.export_to_markdown())

                # Export Docling document format to text:
                with (output_dir / f"{doc_filename}.txt").open("w") as fp:
                    fp.write(conv_res.document.export_to_markdown(strict_text=True))

            if USE_LEGACY:
                # Export Deep Search document JSON format:
                with (output_dir / f"{doc_filename}.legacy.json").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(json.dumps(conv_res.legacy_document.export_to_dict()))

                # Export Text format:
                with (output_dir / f"{doc_filename}.legacy.txt").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(
                        conv_res.legacy_document.export_to_markdown(strict_text=True)
                    )

                # Export Markdown format:
                with (output_dir / f"{doc_filename}.legacy.md").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(conv_res.legacy_document.export_to_markdown())

                # Export Document Tags format:
                with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(conv_res.legacy_document.export_to_doctags())

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
        Path("./tests/data/2203.01017v2.pdf"),
        Path("./tests/data/2305.03393v1.pdf"),
        Path("./tests/data/redp5110.pdf"),
        Path("./tests/data/redp5695.pdf"),
    ]

    # buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read())
    # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_results = doc_converter.convert_all(
        input_doc_paths,
        raises_on_error=False,  # to let conversion run through all and examine results at the end
    )
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    end_time = time.time() - start_time

    _log.info(f"Document conversion complete in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
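
As a quick sanity check of the V2 exports written above, the JSON file round-trips with the standard library alone. A minimal sketch, assuming the script's default ./scratch output directory and the first input stem used in main():

import json
from pathlib import Path

# Load one of the JSON exports produced by export_documents() above.
# The path and filename stem are assumptions matching the script's defaults.
doc_dict = json.loads((Path("./scratch") / "2206.01062.json").read_text())
print(sorted(doc_dict.keys()))  # top-level keys of the exported Docling document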

View File

@@ -1,138 +0,0 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

_log = logging.getLogger(__name__)


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at a time to see the differences in the output.

    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # Docling Parse without EasyOCR
    # -------------------------
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.do_dummy_picture_classifer = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Docling Parse with EasyOCR
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    ###########################################################################

    start_time = time.time()

    conv_result = doc_converter.convert(input_doc_path)

    end_time = time.time() - start_time

    _log.info(f"Document converted in {end_time:.2f} seconds.")

    ## Export results

    output_dir = Path("./scratch")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem

    # Export Docling document JSON format:
    with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
        fp.write(json.dumps(conv_result.document.export_to_dict()))

    # Export Text format:
    with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_text())

    # Export Markdown format:
    with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_markdown())

    # Export Document Tags format:
    with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_document_tokens())


if __name__ == "__main__":
    main()
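
Note that the commented-out variants above still use the older DocumentConverter(pipeline_options=..., pdf_backend=...) signature. As a sketch only, here is the Tesseract variant rewritten in the same format_options style as the active Docling Parse section, using just the names already imported in this file (assuming PdfFormatOption accepts these pipeline options the same way as above):

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesseractOcrOptions()  # or TesseractCliOcrOptions()

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)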

View File

@@ -1,61 +0,0 @@
import logging
import time
from pathlib import Path

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

    # Important: For operating with page images, we must keep them, otherwise
    # the DocumentConverter will discard them to free memory. This is done by
    # setting PdfPipelineOptions.images_scale, which also defines the scale of
    # the images. scale=1 corresponds to a standard 72 DPI image.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Export page images
    for page in conv_res.pages:
        page_no = page.page_no + 1
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.save(fp, format="PNG")

    # Export figures and tables
    for element, image in conv_res.render_element_images(
        element_types=(FigureElement, Table)
    ):
        element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png"
        with element_image_filename.open("wb") as fp:
            image.save(fp, "PNG")

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")


if __name__ == "__main__":
    main()
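
Since scale=1 corresponds to 72 DPI, IMAGE_RESOLUTION_SCALE = 2.0 renders pages at 144 DPI. A minimal sketch checking the pixel size of one exported page image with Pillow (the filename follows the pattern written by the loop above and is an assumption):

from PIL import Image

# First page of the default input document, as written by the page-export loop.
im = Image.open("./scratch/2206.01062-1.png")
print(im.size)  # pixel dimensions at scale 2.0, i.e. 144 DPI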

View File

@@ -1,108 +0,0 @@
import datetime
import logging
import time
from pathlib import Path

import pandas as pd

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

    # Important: For operating with page images, we must keep them, otherwise
    # the DocumentConverter will discard them to free memory. This is done by
    # setting PdfPipelineOptions.images_scale, which also defines the scale of
    # the images. scale=1 corresponds to a standard 72 DPI image.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    rows = []
    for (
        content_text,
        content_md,
        content_dt,
        page_cells,
        page_segments,
        page,
    ) in generate_multimodal_pages(conv_res):
        dpi = page._default_image_scale * 72

        rows.append(
            {
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                "page_hash": create_hash(
                    conv_res.input.document_hash + ":" + str(page.page_no - 1)
                ),
                "image": {
                    "width": page.image.width,
                    "height": page.image.height,
                    "bytes": page.image.tobytes(),
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            }
        )

    # Generate one parquet from all documents
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)

    end_time = time.time() - start_time

    _log.info(
        f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
    )

    # This block demonstrates how the file can be opened with the HF datasets library
    # from datasets import Dataset
    # from PIL import Image
    # multimodal_df = pd.read_parquet(output_filename)

    # # Convert pandas DataFrame to Hugging Face Dataset and load bytes into image
    # dataset = Dataset.from_pandas(multimodal_df)
    # def transforms(examples):
    #     examples["image"] = Image.frombytes('RGB', (examples["image.width"], examples["image.height"]), examples["image.bytes"], 'raw')
    #     return examples
    # dataset = dataset.map(transforms)


if __name__ == "__main__":
    main()
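
Because pd.json_normalize flattens the nested dicts into dot-separated columns (image.width, image.bytes, extra.dpi, ...), the parquet can be read back and a page image reconstructed directly. A minimal sketch, assuming the page images were RGB when serialized with tobytes():

from pathlib import Path

import pandas as pd
from PIL import Image

# Pick the most recent multimodal parquet written by the script above.
output_filename = sorted(Path("./scratch").glob("multimodal_*.parquet"))[-1]
df = pd.read_parquet(output_filename)

# Rebuild the first page image from its raw bytes and stored dimensions.
row = df.iloc[0]
im = Image.frombytes(
    "RGB",
    (int(row["image.width"]), int(row["image.height"])),
    row["image.bytes"],
    "raw",
)
print(im.size)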

View File

@@ -1,51 +0,0 @@
import logging
import time
from pathlib import Path

import pandas as pd

from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    doc_filename = conv_res.input.file.stem

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe()
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as csv
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
        _log.info(f"Saving CSV table to {element_csv_filename}")
        table_df.to_csv(element_csv_filename)

        # Save the table as html
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
        _log.info(f"Saving HTML table to {element_html_filename}")
        with element_html_filename.open("w") as fp:
            fp.write(table.export_to_html())

    end_time = time.time() - start_time

    _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")


if __name__ == "__main__":
    main()
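
Two small notes on the table export: DataFrame.to_markdown() relies on the optional tabulate package being installed, and to_csv() writes the DataFrame index as the first column. A minimal sketch of reading one exported table back (the filename pattern matches the script; the concrete file is an assumption):

import pandas as pd

# Read the first exported table back; index_col=0 drops the index column
# that to_csv() wrote above.
table_df = pd.read_csv("./scratch/2206.01062-table-1.csv", index_col=0)
print(table_df.head())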

View File

@@ -1,10 +0,0 @@
from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
converter = DocumentConverter()
result = converter.convert(source)
print(
    result.document.export_to_markdown()
)  # output: "## Docling Technical Report [...]"
# if the legacy output is needed, use this version:
# print(result.legacy_document.export_to_markdown())  # output: "## Docling Technical Report [...]"

View File

@@ -1,67 +0,0 @@
import json
import logging
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

_log = logging.getLogger(__name__)

USE_EXPERIMENTAL = False

input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2305.03393v1-pg9-img.png"),
    Path("tests/data/2206.01062.pdf"),
]

## for defaults use:
# doc_converter = DocumentConverter()

## to customize use:
doc_converter = (
    DocumentConverter(  # all of the below is optional, has internal defaults.
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
        ],  # whitelist formats, non-matching files are ignored.
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
            ),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
            ),
        },
    )
)

conv_results = doc_converter.convert_all(input_paths)

for res in conv_results:
    out_path = Path("./scratch")
    out_path.mkdir(parents=True, exist_ok=True)  # ensure the output directory exists
    print(
        f"Document {res.input.file.name} converted."
        f"\nSaved markdown output to: {str(out_path)}"
    )
    # print(res.document.export_to_markdown())

    # Export Docling document format to markdown:
    with (out_path / f"{res.input.file.name}.md").open("w") as fp:
        fp.write(res.document.export_to_markdown())

    with (out_path / f"{res.input.file.name}.json").open("w") as fp:
        fp.write(json.dumps(res.document.export_to_dict()))