def main():
    """Convert a sample PDF with the threaded standard pipeline on CUDA.

    Builds a ``DocumentConverter`` whose PDF format option uses
    ``ThreadedStandardPdfPipeline`` with CUDA accelerator options and explicit
    per-stage batch sizes (OCR disabled), then times pipeline initialization
    and document conversion separately and logs both durations together with
    the resulting pages/second.

    Raises:
        RuntimeError: If the conversion status is not
            ``ConversionStatus.SUCCESS``.
    """
    # Without any handler, Python's last-resort handler only emits WARNING and
    # above, so the INFO messages below would be dropped. ``basicConfig`` is a
    # no-op when the root logger is already configured.
    logging.basicConfig()
    logging.getLogger("docling").setLevel(logging.WARNING)
    _log.setLevel(logging.INFO)

    data_folder = Path(__file__).parent / "../../tests/data"
    # Alternative input: data_folder / "pdf" / "2305.03393v1.pdf"  (14 pages)
    input_doc_path = data_folder / "pdf" / "redp5110_sampled.pdf"  # 18 pages

    pipeline_options = ThreadedPdfPipelineOptions(
        accelerator_options=AcceleratorOptions(
            device=AcceleratorDevice.CUDA,
        ),
        ocr_batch_size=4,
        layout_batch_size=64,
        table_batch_size=4,
    )
    pipeline_options.do_ocr = False

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedStandardPdfPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, which would corrupt the measurement.
    init_start = time.perf_counter()
    doc_converter.initialize_pipeline(InputFormat.PDF)
    init_runtime = time.perf_counter() - init_start
    _log.info(f"Pipeline initialized in {init_runtime:.2f} seconds.")

    convert_start = time.perf_counter()
    conv_result = doc_converter.convert(input_doc_path)
    pipeline_runtime = time.perf_counter() - convert_start

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if conv_result.status != ConversionStatus.SUCCESS:
        raise RuntimeError(
            f"Conversion of {input_doc_path} ended with status {conv_result.status}"
        )

    num_pages = len(conv_result.pages)
    _log.info(f"Document converted in {pipeline_runtime:.2f} seconds.")
    _log.info(f" {num_pages / pipeline_runtime:.2f} pages/second.")
def main():
    """Convert a sample PDF through the VLM pipeline against a local inference server.

    Sets the global page batch size and enables pipeline timing profiling,
    configures ``ApiVlmOptions`` for an OpenAI-compatible chat-completions
    endpoint, runs the conversion with ``VlmPipeline``, logs throughput and
    per-stage timing statistics (min / median / max), and writes the collected
    timings to a timestamped JSON file in the current working directory.

    Raises:
        RuntimeError: If the conversion status is not
            ``ConversionStatus.SUCCESS``.
    """
    # Without any handler, Python's last-resort handler only emits WARNING and
    # above, so the INFO messages below would be dropped. ``basicConfig`` is a
    # no-op when the root logger is already configured.
    logging.basicConfig()
    logging.getLogger("docling").setLevel(logging.WARNING)
    _log.setLevel(logging.INFO)

    batch_size = 64

    # The same value is used for the page batch size and for the number of
    # concurrent requests sent to the inference server.
    settings.perf.page_batch_size = batch_size
    # conv_result.timings is read below; presumably only populated when
    # profiling is enabled -- TODO confirm.
    settings.debug.profile_pipeline_timings = True

    data_folder = Path(__file__).parent / "../../tests/data"
    # Alternative input: data_folder / "pdf" / "2305.03393v1.pdf"  (14 pages)
    input_doc_path = data_folder / "pdf" / "redp5110_sampled.pdf"  # 18 pages

    model_spec = vlm_model_specs.GRANITEDOCLING_TRANSFORMERS
    vlm_options = ApiVlmOptions(
        url="http://localhost:8000/v1/chat/completions",  # LM studio defaults to port 1234, VLLM to 8000
        params={
            "model": model_spec.repo_id,
            "max_tokens": 4096,
            "skip_special_tokens": True,
        },
        prompt=model_spec.prompt,
        timeout=90,
        scale=2.0,
        temperature=0.0,
        concurrency=batch_size,
        # An empty stop string would match any output; the closing DocTags
        # tag is the intended terminator.
        # NOTE(review): restored from what looks like a stripped tag -- confirm.
        stop_strings=["</doctag>", "<|end_of_text|>"],
        response_format=ResponseFormat.DOCTAGS,
    )

    pipeline_options = VlmPipelineOptions(
        vlm_options=vlm_options,
        enable_remote_services=True,  # required when using a remote inference service.
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            ),
        }
    )

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, which would corrupt the measurement.
    init_start = time.perf_counter()
    doc_converter.initialize_pipeline(InputFormat.PDF)
    init_runtime = time.perf_counter() - init_start
    _log.info(f"Pipeline initialized in {init_runtime:.2f} seconds.")

    # Captured before the conversion so the output file name carries the
    # start time of the run.
    now = datetime.datetime.now()
    conv_result = doc_converter.convert(input_doc_path)

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if conv_result.status != ConversionStatus.SUCCESS:
        raise RuntimeError(
            f"Conversion of {input_doc_path} ended with status {conv_result.status}"
        )

    num_pages = len(conv_result.pages)
    pipeline_runtime = conv_result.timings["pipeline_total"].times[0]
    _log.info(f"Document converted in {pipeline_runtime:.2f} seconds.")
    _log.info(f" [efficiency]: {num_pages / pipeline_runtime:.2f} pages/second.")
    for stage in ("page_init", "vlm"):
        values = np.array(conv_result.timings[stage].times)
        _log.info(
            f" [{stage}]: {np.min(values):.2f} / {np.median(values):.2f} / {np.max(values):.2f} seconds/page"
        )

    # Serialize every recorded timing via pydantic; dump_json returns bytes.
    timings_adapter = TypeAdapter(dict[str, ProfilingItem])
    timings_file = Path(f"result-timings-gpu-vlm-{now:%Y-%m-%d_%H-%M-%S}.json")
    timings_file.write_bytes(timings_adapter.dump_json(conv_result.timings, indent=2))
    _log.info(f"Profile details in {timings_file}.")
+ + +### Standard Pipeline + +Enable GPU acceleration by configuring the accelerator device and concurrency options using Docling's API: + +```python +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions + +# Configure accelerator options for GPU +accelerator_options = AcceleratorOptions( + device=AcceleratorDevice.CUDA, # or AcceleratorDevice.AUTO +) +``` + +Batch size and concurrency for document processing are controlled for each stage of the pipeline as follows: + +```python +from docling.datamodel.pipeline_options import ( + ThreadedPdfPipelineOptions, +) + +pipeline_options = ThreadedPdfPipelineOptions( + ocr_batch_size=64, # default 4 + layout_batch_size=64, # default 4 + table_batch_size=4, # currently not using GPU batching +) +``` + +Setting higher batch sizes (in particular `layout_batch_size` for the layout detection stage) will run the Docling models in GPU batch inference mode. + +#### Complete example + +For a complete example see [gpu_standard_pipeline.py](../examples/gpu_standard_pipeline.py). + + +### VLM Pipeline + +For best GPU utilization, use a local inference server. Docling supports inference servers that expose OpenAI-compatible chat completion endpoints. For example: + +- vllm: `http://localhost:8000/v1/chat/completions` (available only on Linux) +- LM Studio: `http://localhost:1234/v1/chat/completions` (available both on Linux and Windows) +- Ollama: `http://localhost:11434/v1/chat/completions` (available both on Linux and Windows) + + +#### Start the inference server + +Here is an example of how to start the [vllm](https://docs.vllm.ai/) inference server with optimal parameters for Granite Docling. 
+ +```sh +vllm serve ibm-granite/granite-docling-258M \ + --host 127.0.0.1 --port 8000 \ + --max-num-seqs 512 \ + --max-num-batched-tokens 8192 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.9 +``` + +#### Configure Docling + +Configure the VLM pipeline using Docling's VLM options: + +```python +from docling.datamodel.pipeline_options import VlmPipelineOptions + +vlm_options = VlmPipelineOptions( + enable_remote_services=True, + vlm_options={ + "url": "http://localhost:8000/v1/chat/completions", # or any other compatible endpoint + "params": { + "model": "ibm-granite/granite-docling-258M", + "max_tokens": 4096, + }, + "concurrency": 64, # default is 1 + "prompt": "Convert this page to docling.", + "timeout": 90, + } +) +``` + +In addition to the concurrency, the `page_batch_size` Docling parameter must also be set. Make sure to set `settings.perf.page_batch_size >= vlm_options.concurrency`. + +```python +from docling.datamodel.settings import settings + +settings.perf.page_batch_size = 64 # default is 4 +``` + +#### Complete example + +For a complete example see [gpu_vlm_pipeline.py](../examples/gpu_vlm_pipeline.py). + + +#### Available models + +Both LM Studio and Ollama rely on llama.cpp as their runtime engine. To use this engine, models have to be converted to the GGUF format. + +Here is a list of known models that are available in GGUF format and how to use them. + +TBA. 
+ +## Performance results + +Test data: +- Number of pages: 192 +- Number of tables: 95 + +Test infrastructure: +- Instance type: `g6e.2xlarge` +- CPU: 8 vCPUs, AMD EPYC 7R13 +- RAM: 64GB +- GPU: NVIDIA L40S 48GB +- CUDA Version: 13.0, Driver Version: 580.95.05 + +| Pipeline | Page efficiency | +| - | - | +| Standard - Inline | 3.1 pages/second | +| VLM - Inference server (GraniteDocling) | 2.4 pages/second | diff --git a/mkdocs.yml b/mkdocs.yml index 0cf11183..204a64da 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -64,6 +64,7 @@ nav: - Supported formats: usage/supported_formats.md - Enrichment features: usage/enrichments.md - Vision models: usage/vision_models.md + - GPU support: usage/gpu.md - MCP server: usage/mcp.md - Jobkit: usage/jobkit.md - FAQ: @@ -116,6 +117,9 @@ nav: - "Figure enrichment": examples/develop_picture_enrichment.py - "Formula enrichment": examples/develop_formula_understanding.py - "Enrich a DoclingDocument": examples/enrich_doclingdocument.py + - ⚡️ GPU optimization: + - "Standard pipeline": examples/gpu_standard_pipeline.py + - "VLM pipeline": examples/gpu_vlm_pipeline.py - 🗂️ More examples: - examples/dpk-ingest-chunk-tokenize.ipynb - examples/rag_azuresearch.ipynb