# docling/docs/examples/gpu_standard_pipeline.py

# %% [markdown]
#
# What this example does
# - Run a PDF conversion with the standard pipeline, using the recommended GPU setup
#
# Requirements
# - Python 3.9+
# - Install Docling: `pip install docling`
#
# How to run
# - `python docs/examples/gpu_standard_pipeline.py`
#
# This example is part of a set of GPU optimization strategies. Read more about them in [GPU support](../../usage/gpu/).
#
# ## Example code
# %%

import datetime
import logging
import time
from pathlib import Path

import numpy as np
from pydantic import TypeAdapter

from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options import (
    ThreadedPdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline
from docling.utils.profiling import ProfilingItem

# Module-level logger for this example; main() raises its level to INFO so the
# timing messages are not filtered out by the logger itself.
_log = logging.getLogger(__name__)


def main():
    """Convert a sample PDF with the threaded standard pipeline on CUDA.

    The pipeline is initialized first and timed on its own, so that the
    conversion timing reported afterwards does not include the initialization
    step. Both durations, and the resulting pages/second, are logged at INFO
    level.

    Raises:
        RuntimeError: If the conversion status is not
            ``ConversionStatus.SUCCESS``.
    """
    # Without any configured handler, Python's last-resort handler only emits
    # WARNING and above, so the INFO messages below would be silently dropped.
    # basicConfig() attaches a stderr handler to the root logger (a no-op if
    # one is already installed) and leaves the root level untouched.
    logging.basicConfig()
    logging.getLogger("docling").setLevel(logging.WARNING)
    _log.setLevel(logging.INFO)

    data_folder = Path(__file__).parent / "../../tests/data"
    # Alternative input: data_folder / "pdf" / "2305.03393v1.pdf"  # 14 pages
    input_doc_path = data_folder / "pdf" / "redp5110_sampled.pdf"  # 18 pages

    # Run the models on the CUDA device with explicit per-stage batch sizes.
    pipeline_options = ThreadedPdfPipelineOptions(
        accelerator_options=AcceleratorOptions(
            device=AcceleratorDevice.CUDA,
        ),
        ocr_batch_size=4,
        layout_batch_size=64,
        table_batch_size=4,
    )
    # OCR is switched off for this run.
    pipeline_options.do_ocr = False

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedStandardPdfPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )

    # perf_counter() is monotonic and high-resolution, which makes it the
    # appropriate clock for measuring elapsed durations.
    init_start = time.perf_counter()
    doc_converter.initialize_pipeline(InputFormat.PDF)
    init_runtime = time.perf_counter() - init_start
    _log.info(f"Pipeline initialized in {init_runtime:.2f} seconds.")

    convert_start = time.perf_counter()
    conv_result = doc_converter.convert(input_doc_path)
    pipeline_runtime = time.perf_counter() - convert_start

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if conv_result.status != ConversionStatus.SUCCESS:
        raise RuntimeError(
            f"Conversion of {input_doc_path} failed with status {conv_result.status}"
        )

    num_pages = len(conv_result.pages)
    _log.info(f"Document converted in {pipeline_runtime:.2f} seconds.")
    _log.info(f"  {num_pages / pipeline_runtime:.2f} pages/second.")


# Run the example only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()