docs: Describe examples (#2262)

* Update .py examples with clearer guidance,
update out of date imports and calls

Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>

* Fix minimal.py string error, fix ruff format error

Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>

* fix more CI issues

Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>

---------

Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>
This commit is contained in:
Mingxuan Zhao
2025-09-16 10:00:38 -04:00
committed by GitHub
parent 0e95171dd6
commit ff351fd40c
21 changed files with 608 additions and 85 deletions

View File

@@ -1,3 +1,33 @@
"""
Batch convert multiple PDF files and export results in several formats.
What this example does
- Loads a small set of sample PDFs.
- Runs the Docling PDF pipeline once per file.
- Writes outputs to `scratch/` in multiple formats (JSON, HTML, Markdown, text, doctags, YAML).
Prerequisites
- Install Docling and dependencies as described in the repository README.
- Ensure you can import `docling` from your Python environment.
- YAML export requires `PyYAML` (`pip install pyyaml`).
Input documents
- By default, this example uses a few PDFs from `tests/data/pdf/` in the repo.
- If you cloned without test data, or want to use your own files, edit
`input_doc_paths` below to point to PDFs on your machine.
Output formats (controlled by flags)
- `USE_V2 = True` enables the current Docling document exports (recommended).
- `USE_LEGACY = False` keeps legacy Deep Search exports disabled.
You can set it to `True` if you need legacy formats for compatibility tests.
Notes
- Set `pipeline_options.generate_page_images = True` to include page images in HTML.
- The script logs conversion progress and raises if any documents fail.
- This example shows both helper methods like `save_as_*` and lower-level
`export_to_*` + manual file writes; outputs may overlap intentionally.
"""
import json
import logging
import time
@@ -15,6 +45,9 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
# Export toggles:
# - USE_V2 controls modern Docling document exports.
# - USE_LEGACY enables legacy Deep Search exports for comparison or migration.
USE_V2 = True
USE_LEGACY = False
@@ -35,6 +68,9 @@ def export_documents(
doc_filename = conv_res.input.file.stem
if USE_V2:
# Recommended modern Docling exports. These helpers mirror the
# lower-level "export_to_*" methods used below, but handle
# common details like image handling.
conv_res.document.save_as_json(
output_dir / f"{doc_filename}.json",
image_mode=ImageRefMode.PLACEHOLDER,
@@ -121,6 +157,9 @@ def export_documents(
def main():
logging.basicConfig(level=logging.INFO)
# Location of sample PDFs used by this example. If your checkout does not
# include test data, change `data_folder` or point `input_doc_paths` to
# your own files.
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_paths = [
data_folder / "pdf/2206.01062.pdf",
@@ -139,6 +178,8 @@ def main():
# settings.debug.visualize_tables = True
# settings.debug.visualize_cells = True
# Configure the PDF pipeline. Enabling page image generation improves HTML
# previews (embedded images) but adds processing time.
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True
@@ -152,11 +193,14 @@ def main():
start_time = time.time()
# Convert all inputs. Set `raises_on_error=False` to keep processing other
# files even if one fails; errors are summarized after the run.
conv_results = doc_converter.convert_all(
input_doc_paths,
raises_on_error=False, # to let conversion run through all and examine results at the end
)
success_count, partial_success_count, failure_count = export_documents(
# Write outputs to ./scratch and log a summary.
_success_count, _partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("scratch")
)

View File

@@ -1,8 +1,28 @@
# Compare VLM models
# ==================
# %% [markdown]
# Compare different VLM models by running the VLM pipeline and timing outputs.
#
# This example runs the VLM pipeline with different vision-language models.
# Their runtime as well as their output quality is compared.
# What this example does
# - Iterates through a list of VLM model configurations and converts the same file.
# - Prints per-page generation times and saves JSON/MD/HTML to `scratch/`.
# - Summarizes total inference time and pages processed in a table.
#
# Requirements
# - Install `tabulate` for pretty printing (`pip install tabulate`).
#
# Prerequisites
# - Install Docling with VLM extras. Ensure models can be downloaded or are available.
#
# How to run
# - From the repo root: `python docs/examples/compare_vlm_models.py`.
# - Results are saved to `scratch/` with filenames including the model and framework.
#
# Notes
# - MLX models are skipped automatically on non-macOS platforms.
# - On CUDA systems, you can enable flash_attention_2 (see commented lines).
# - Running multiple VLMs can be GPU/CPU intensive and time-consuming; ensure
# enough VRAM/system RAM and close other memory-heavy apps.
# %%
import json
import sys
@@ -31,6 +51,8 @@ from docling.pipeline.vlm_pipeline import VlmPipeline
def convert(sources: list[Path], converter: DocumentConverter):
# Note: this helper assumes a single-item `sources` list. It returns after
# processing the first source to keep runtime/output focused.
model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
framework = pipeline_options.vlm_options.inference_framework
for source in sources:
@@ -61,6 +83,8 @@ def convert(sources: list[Path], converter: DocumentConverter):
print("===== Final output of the converted document =======")
# Manual export for illustration. Below, `save_as_json()` writes the same
# JSON again; kept intentionally to show both approaches.
with (out_path / f"{fname}.json").open("w") as fp:
fp.write(json.dumps(res.document.export_to_dict()))

View File

@@ -1,3 +1,39 @@
# %% [markdown]
# Customize PDF conversion by toggling OCR/backends and pipeline options.
#
# What this example does
# - Shows several alternative configurations for the Docling PDF pipeline.
# - Lets you try OCR engines (EasyOCR, Tesseract, system OCR) or no OCR.
# - Converts a single sample PDF and exports results to `scratch/`.
#
# Prerequisites
# - Install Docling and its optional OCR backends per the docs.
# - Ensure you can import `docling` from your Python environment.
#
# How to run
# - From the repository root, run: `python docs/examples/custom_convert.py`.
# - Outputs are written under `scratch/` next to where you run the script.
#
# Choosing a configuration
# - Only one configuration block should be active at a time.
# - Uncomment exactly one of the sections below to experiment.
# - The file ships with "Docling Parse with EasyOCR" enabled as a sensible default.
# - If you uncomment a backend or OCR option that is not imported above, also
# import its class, e.g.:
# - `from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend`
# - `from docling.datamodel.pipeline_options import TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions`
#
# Input document
# - Defaults to a single PDF from `tests/data/pdf/` in the repo.
# - If you don't have the test data, update `input_doc_path` to a local PDF.
#
# Notes
# - EasyOCR language: adjust `pipeline_options.ocr_options.lang` (e.g., ["en"], ["es"], ["en", "de"]).
# - Accelerators: tune `AcceleratorOptions` to select CPU/GPU or threads.
# - Exports: JSON, plain text, Markdown, and doctags are saved in `scratch/`.
# %%
import json
import logging
import time
@@ -21,9 +57,8 @@ def main():
###########################################################################
# The following sections contain a combination of PipelineOptions
# and PDF Backends for various configurations.
# Uncomment one section at a time to see the differences in the output.
# The sections below demo combinations of PdfPipelineOptions and backends.
# Tip: Uncomment exactly one section at a time to compare outputs.
# PyPdfium without EasyOCR
# --------------------
@@ -68,8 +103,10 @@ def main():
# }
# )
# Docling Parse with EasyOCR
# ----------------------
# Docling Parse with EasyOCR (default)
# -------------------------------
# Enables OCR and table structure with EasyOCR, using automatic device
# selection via AcceleratorOptions. Adjust languages as needed.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
@@ -86,7 +123,7 @@ def main():
)
# Docling Parse with EasyOCR (CPU only)
# ----------------------
# -------------------------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.ocr_options.use_gpu = False # <-- set this.
@@ -100,7 +137,7 @@ def main():
# )
# Docling Parse with Tesseract
# ----------------------
# ----------------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
@@ -114,7 +151,7 @@ def main():
# )
# Docling Parse with Tesseract CLI
# ----------------------
# --------------------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
@@ -127,8 +164,8 @@ def main():
# }
# )
# Docling Parse with ocrmac(Mac only)
# ----------------------
# Docling Parse with ocrmac (macOS only)
# --------------------------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
@@ -154,13 +191,13 @@ def main():
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem
# Export Deep Search document JSON format:
# Export Docling document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
fp.write(json.dumps(conv_result.document.export_to_dict()))
# Export Text format:
# Export Text format (plain text via Markdown export):
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_result.document.export_to_text())
fp.write(conv_result.document.export_to_markdown(strict_text=True))
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:

View File

@@ -1,6 +1,21 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual formula understanding model.
# %% [markdown]
# Developing an enrichment model example (formula understanding: scaffold only).
#
# What this example does
# - Shows how to define pipeline options, an enrichment model, and extend a pipeline.
# - Displays cropped images of formula items and yields them back unchanged.
#
# Important
# - This is a development scaffold; it does not run a real formula understanding model.
#
# How to run
# - From the repo root: `python docs/examples/develop_formula_understanding.py`.
#
# Notes
# - Set `do_formula_understanding=True` to enable the example enrichment stage.
# - Extends `StandardPdfPipeline` and keeps the backend when enrichment is enabled.
# %%
import logging
from collections.abc import Iterable
@@ -42,6 +57,8 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel
return
for enrich_element in element_batch:
# Opens a viewer window for each cropped formula image; comment this out when
# running headless or processing many items to avoid a flood of blocking windows.
enrich_element.image.show()
yield enrich_element.item

View File

@@ -1,6 +1,21 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual picture classifier model.
# %% [markdown]
# Developing a picture enrichment model (classifier scaffold only).
#
# What this example does
# - Demonstrates how to implement an enrichment model that annotates pictures.
# - Adds a dummy PictureClassificationData entry to each PictureItem.
#
# Important
# - This is a scaffold for development; it does not run a real classifier.
#
# How to run
# - From the repo root: `python docs/examples/develop_picture_enrichment.py`.
#
# Notes
# - Enables picture image generation and sets `images_scale` to improve crops.
# - Extends `StandardPdfPipeline` with a custom enrichment stage.
# %%
import logging
from collections.abc import Iterable
@@ -43,7 +58,7 @@ class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
assert isinstance(element, PictureItem)
# uncomment this to interactively visualize the image
# element.get_image(doc).show()
# element.get_image(doc).show() # may block; avoid in headless runs
element.annotations.append(
PictureClassificationData(

View File

@@ -1,6 +1,26 @@
## Enrich DoclingDocument
# This example lets you run Docling enrichment models on documents that have already been converted
# and stored as serialized DoclingDocument JSON files.
# %% [markdown]
# Enrich an existing DoclingDocument JSON with a custom model (post-conversion).
#
# What this example does
# - Loads a previously converted DoclingDocument from JSON (no reconversion).
# - Uses a backend to crop images for items and runs an enrichment model in batches.
# - Prints a few example annotations to stdout.
#
# Prerequisites
# - A DoclingDocument JSON produced by another conversion (path configured below).
# - Install Docling and dependencies for the chosen enrichment model.
# - Ensure the JSON and the referenced PDF match (same document/version), so
# provenance bounding boxes line up for accurate cropping.
#
# How to run
# - From the repo root: `python docs/examples/enrich_doclingdocument.py`.
# - Adjust `input_doc_path` and `input_pdf_path` if your data is elsewhere.
#
# Notes
# - `BATCH_SIZE` controls how many elements are passed to the model at once.
# - `prepare_element()` crops context around elements based on the model's expansion.
# %%
### Load modules
@@ -24,6 +44,7 @@ from docling.utils.utils import chunkify
### Define batch size used for processing
BATCH_SIZE = 4
# Trade-off: larger batches improve throughput but increase memory usage.
### From DocItem to the model inputs
# The following function is responsible for taking an item and applying the required pre-processing for the model.

View File

@@ -1,3 +1,29 @@
# %% [markdown]
# Export page, figure, and table images from a PDF and save rich outputs.
#
# What this example does
# - Converts a PDF, keeps page/element images, and writes them to `scratch/`.
# - Exports Markdown and HTML with either embedded or referenced images.
#
# Prerequisites
# - Install Docling and image dependencies. Pillow is used for image saves
# (`pip install pillow`) if not already available via Docling's deps.
# - Ensure you can import `docling` from your Python environment.
#
# How to run
# - From the repo root: `python docs/examples/export_figures.py`.
# - Outputs (PNG, MD, HTML) are written to `scratch/`.
#
# Key options
# - `IMAGE_RESOLUTION_SCALE`: increase to render higher-resolution images (e.g., 2.0).
# - `PdfPipelineOptions.generate_page_images`/`generate_picture_images`: preserve images for export.
# - `ImageRefMode`: choose `EMBEDDED` or `REFERENCED` when saving Markdown/HTML.
#
# Input document
# - Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.
# %%
import logging
import time
from pathlib import Path
@@ -20,12 +46,9 @@ def main():
input_doc_path = data_folder / "pdf/2206.01062.pdf"
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
# The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
# with the image field
# Keep page/element images so they can be exported. The `images_scale` controls
# the rendered image resolution (scale=1 ~ 72 DPI). The `generate_*` toggles
# decide which elements are enriched with images.
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True

View File

@@ -1,3 +1,34 @@
# %% [markdown]
# Export multimodal page data (image bytes, text, segments) to a Parquet file.
#
# What this example does
# - Converts a PDF and assembles per-page multimodal records: image, cells, text, segments.
# - Normalizes records to a pandas DataFrame and writes a timestamped `.parquet` in `scratch/`.
#
# Prerequisites
# - Install Docling and `pandas`. Optional: `datasets` and `Pillow` for the commented demo.
#
# How to run
# - From the repo root: `python docs/examples/export_multimodal.py`.
# - Output parquet is written to `scratch/`.
#
# Key options
# - `IMAGE_RESOLUTION_SCALE`: page rendering scale (1 ~ 72 DPI).
# - `PdfPipelineOptions.generate_page_images`: keep page images for export.
#
# Requirements
# - Writing Parquet requires an engine such as `pyarrow` or `fastparquet`
# (`pip install pyarrow` is the most common choice).
#
# Input document
# - Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.
#
# Notes
# - The commented block at the bottom shows how to load the Parquet with HF Datasets
# and reconstruct images from raw bytes.
# %%
import datetime
import logging
import time
@@ -23,10 +54,8 @@ def main():
input_doc_path = data_folder / "pdf/2206.01062.pdf"
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
# Keep page images so they can be exported to the multimodal rows.
# Use PdfPipelineOptions.images_scale to control the render scale (1 ~ 72 DPI).
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True

View File

@@ -1,3 +1,27 @@
# %% [markdown]
# Extract tables from a PDF and export them as CSV and HTML.
#
# What this example does
# - Converts a PDF and iterates detected tables.
# - Prints each table as Markdown to stdout, and saves CSV/HTML to `scratch/`.
#
# Prerequisites
# - Install Docling and `pandas`.
#
# How to run
# - From the repo root: `python docs/examples/export_tables.py`.
# - Outputs are written to `scratch/`.
#
# Input document
# - Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.
#
# Notes
# - `table.export_to_dataframe()` returns a pandas DataFrame for convenient export/processing.
# - Printing via `DataFrame.to_markdown()` may require the optional `tabulate` package
# (`pip install tabulate`). If unavailable, skip the print or use `to_csv()`.
# %%
import logging
import time
from pathlib import Path
@@ -32,12 +56,12 @@ def main():
print(f"## Table {table_ix}")
print(table_df.to_markdown())
# Save the table as csv
# Save the table as CSV
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)
# Save the table as html
# Save the table as HTML
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:

View File

@@ -1,3 +1,31 @@
# %% [markdown]
# Force full-page OCR on a PDF using different OCR backends.
#
# What this example does
# - Enables full-page OCR and table structure extraction for a sample PDF.
# - Demonstrates how to switch between OCR backends via `ocr_options`.
#
# Prerequisites
# - Install Docling and the desired OCR backend's dependencies (Tesseract, EasyOCR,
# RapidOCR, or macOS OCR).
#
# How to run
# - From the repo root: `python docs/examples/full_page_ocr.py`.
# - The script prints Markdown text to stdout.
#
# Choosing an OCR backend
# - Uncomment one `ocr_options = ...` line below. Exactly one should be active.
# - `force_full_page_ocr=True` processes each page purely via OCR (often slower
# than hybrid detection). Use when layout extraction is unreliable or the PDF
# contains scanned pages.
# - If you switch OCR backends, ensure the corresponding option class is imported,
# e.g., `EasyOcrOptions`, `TesseractOcrOptions`, `OcrMacOptions`, `RapidOcrOptions`.
#
# Input document
# - Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.
# %%
from pathlib import Path
from docling.datamodel.base_models import InputFormat
@@ -17,7 +45,8 @@ def main():
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only), RapidOcrOptions
# Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions,
# TesseractCliOcrOptions, OcrMacOptions (macOS only), RapidOcrOptions
# ocr_options = EasyOcrOptions(force_full_page_ocr=True)
# ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
# ocr_options = OcrMacOptions(force_full_page_ocr=True)

View File

@@ -1,12 +1,30 @@
# %% [markdown]
# Inspect the contents associated with each picture in a converted document.
#
# What this example does
# - Converts a PDF and iterates over each PictureItem.
# - Prints the caption and the textual items contained within the picture region.
#
# How to run
# - From the repo root: `python docs/examples/inspect_picture_content.py`.
#
# Notes
# - Uncomment `picture.get_image(doc).show()` to visually inspect each picture.
# - Adjust `source` to point to a different PDF if desired.
# %%
from docling_core.types.doc import TextItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# Change this to a local path if desired
source = "tests/data/pdf/amt_handbook_sample.pdf"
pipeline_options = PdfPipelineOptions()
# Higher scale yields sharper crops when inspecting picture content.
pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
@@ -19,7 +37,7 @@ result = doc_converter.convert(source)
doc = result.document
for picture in doc.pictures:
# picture.get_image(doc).show() # display the picture
# picture.get_image(doc).show() # display the picture
print(picture.caption_text(doc), " contains these elements:")
for item, level in doc.iterate_items(root=picture, traverse_pictures=True):

View File

@@ -1,9 +1,32 @@
# %% [markdown]
# Simple conversion: one document to Markdown
# ==========================================
#
# What this example does
# - Converts a single source (URL or local file path) to a unified Docling
# document and prints Markdown to stdout.
#
# Requirements
# - Python 3.9+
# - Install Docling: `pip install docling`
#
# How to run
# - Use the default sample URL: `python docs/examples/minimal.py`
# - To use your own file or URL, edit the `source` variable below.
#
# Notes
# - The converter auto-detects supported formats (PDF, DOCX, HTML, PPTX, images, etc.).
# - For batch processing or saving outputs to files, see `docs/examples/batch_convert.py`.
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
# Change this to a local path or another URL if desired.
# Note: using the default URL requires network access; if offline, provide a
# local file path (e.g., Path("/path/to/file.pdf")).
source = "https://arxiv.org/pdf/2408.09869"
converter = DocumentConverter()
doc = converter.convert(source).document
result = converter.convert(source)
print(doc.export_to_markdown())
# output: ## Docling Technical Report [...]
# Print Markdown to stdout.
print(result.document.export_to_markdown())

View File

@@ -1,3 +1,28 @@
# %% [markdown]
# Minimal ASR pipeline example: transcribe an audio file to Markdown text.
#
# What this example does
# - Configures the ASR pipeline with a default model spec and converts one audio file.
# - Prints the recognized speech segments in Markdown with timestamps.
#
# Prerequisites
# - Install Docling with ASR extras and any audio dependencies (ffmpeg, etc.).
# - Ensure your environment can download or access the configured ASR model.
# - Some formats require ffmpeg codecs; install ffmpeg and ensure it's on PATH.
#
# How to run
# - From the repository root, run: `python docs/examples/minimal_asr_pipeline.py`.
# - The script prints the transcription to stdout.
#
# Customizing the model
# - Edit `get_asr_converter()` to switch `asr_model_specs` (e.g., language or model size).
# - Keep `InputFormat.AUDIO` and `AsrPipeline` unchanged for a minimal setup.
#
# Input audio
# - Defaults to `tests/data/audio/sample_10s.mp3`. Update `audio_path` to your own file if needed.
# %%
from pathlib import Path
from docling_core.types.doc import DoclingDocument
@@ -11,7 +36,11 @@ from docling.pipeline.asr_pipeline import AsrPipeline
def get_asr_converter():
"""Create a DocumentConverter configured for ASR with whisper_turbo model."""
"""Create a DocumentConverter configured for ASR with a default model.
Uses `asr_model_specs.WHISPER_TURBO` by default. You can swap in another
model spec from `docling.datamodel.asr_model_specs` to experiment.
"""
pipeline_options = AsrPipelineOptions()
pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO
@@ -27,7 +56,7 @@ def get_asr_converter():
def asr_pipeline_conversion(audio_path: Path) -> DoclingDocument:
"""ASR pipeline conversion using whisper_turbo"""
"""Run the ASR pipeline and return a `DoclingDocument` transcript."""
# Check if the test audio file exists
assert audio_path.exists(), f"Test audio file not found: {audio_path}"

View File

@@ -1,3 +1,25 @@
# %% [markdown]
# Minimal VLM pipeline example: convert a PDF using a vision-language model.
#
# What this example does
# - Runs the VLM-powered pipeline on a PDF (by URL) and prints Markdown output.
# - Shows two setups: default (Transformers/SmolDocling) and macOS MPS/MLX.
#
# Prerequisites
# - Install Docling with VLM extras and the appropriate backend (Transformers or MLX).
# - Ensure your environment can download model weights (e.g., from Hugging Face).
#
# How to run
# - From the repository root, run: `python docs/examples/minimal_vlm_pipeline.py`.
# - The script prints the converted Markdown to stdout.
#
# Notes
# - `source` may be a local path or a URL to a PDF.
# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.SMOLDOCLING_MLX`).
# - For more configurations and model comparisons, see `docs/examples/compare_vlm_models.py`.
# %%
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
@@ -6,6 +28,7 @@ from docling.datamodel.pipeline_options import (
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
# Convert a public arXiv PDF; replace with a local path if preferred.
source = "https://arxiv.org/pdf/2501.17887"
###### USING SIMPLE DEFAULT VALUES
@@ -26,7 +49,8 @@ print(doc.export_to_markdown())
###### USING MACOS MPS ACCELERATOR
# For more options see the compare_vlm_models.py example.
# Demonstrates using MLX on macOS with MPS acceleration (macOS only).
# For more options see the `compare_vlm_models.py` example.
pipeline_options = VlmPipelineOptions(
vlm_options=vlm_model_specs.SMOLDOCLING_MLX,

View File

@@ -1,3 +1,27 @@
# %% [markdown]
# Describe pictures using a remote VLM API (vLLM, LM Studio, or watsonx.ai).
#
# What this example does
# - Configures `PictureDescriptionApiOptions` for local or cloud providers.
# - Converts a PDF, then prints each picture's caption and annotations.
#
# Prerequisites
# - Install Docling and `python-dotenv` if loading env vars from a `.env` file.
# - For local providers: ensure vLLM or LM Studio is running.
# - For watsonx.ai: set `WX_API_KEY` and `WX_PROJECT_ID` in the environment.
#
# How to run
# - From the repo root: `python docs/examples/pictures_description_api.py`.
# - Uncomment exactly one provider config and set `enable_remote_services=True` (already set).
#
# Notes
# - vLLM default endpoint: `http://localhost:8000/v1/chat/completions`.
# - LM Studio default endpoint: `http://localhost:1234/v1/chat/completions`.
# - Calling remote APIs sends page images/text to the provider; review privacy and
# costs. For local testing, LM Studio runs everything on your machine.
# %%
import logging
import os
from pathlib import Path

View File

@@ -1,25 +1,46 @@
# %% [markdown]
# Use RapidOCR with custom ONNX models to OCR a PDF page and print Markdown.
#
# What this example does
# - Downloads RapidOCR models with ModelScope's `snapshot_download` (repo `RapidAI/RapidOCR`).
# - Configures `RapidOcrOptions` with explicit det/rec/cls model paths.
# - Runs the PDF pipeline with RapidOCR and prints Markdown output.
#
# Prerequisites
# - Install Docling, `modelscope`, and have network access to download models.
# - Ensure your environment can import `docling` and `modelscope`.
#
# How to run
# - From the repo root: `python docs/examples/rapidocr_with_custom_models.py`.
# - The script prints the recognized text as Markdown to stdout.
#
# Notes
# - The default `source` points to an arXiv PDF URL; replace with a local path if desired.
# - Model paths are derived from the downloaded snapshot directory.
# - ModelScope caches downloads (typically under `~/.cache/modelscope`); set a proxy
# or pre-download models if running in a restricted network environment.
# %%
import os
from modelscope import snapshot_download
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import (
ConversionResult,
DocumentConverter,
InputFormat,
PdfFormatOption,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
# Source document to convert
source = "https://arxiv.org/pdf/2408.09869v4"
# Download RappidOCR models from HuggingFace
# Download RapidOCR models from Hugging Face
print("Downloading RapidOCR models")
download_path = snapshot_download(repo_id="RapidAI/RapidOCR")
# Setup RapidOcrOptions for english detection
# Setup RapidOcrOptions for English detection
det_model_path = os.path.join(
download_path, "onnx", "PP-OCRv5", "det", "ch_PP-OCRv5_server_det.onnx"
)

View File

@@ -1,3 +1,22 @@
# %% [markdown]
# Run conversion with an explicit accelerator configuration (CPU/MPS/CUDA).
#
# What this example does
# - Shows how to select the accelerator device and thread count.
# - Enables OCR and table structure to exercise compute paths, and prints timings.
#
# How to run
# - From the repo root: `python docs/examples/run_with_accelerator.py`.
# - Toggle the commented `AcceleratorOptions` examples to try AUTO/MPS/CUDA.
#
# Notes
# - EasyOCR does not support `cuda:N` device selection (defaults to `cuda:0`).
# - `settings.debug.profile_pipeline_timings = True` prints profiling details.
# - `AcceleratorDevice.MPS` is macOS-only; `CUDA` requires a compatible GPU and
# CUDA-enabled PyTorch build. CPU mode works everywhere.
# %%
from pathlib import Path
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions

View File

@@ -1,3 +1,32 @@
# %% [markdown]
# Run conversion across multiple input formats and customize handling per type.
#
# What this example does
# - Demonstrates converting a mixed list of files (PDF, DOCX, PPTX, HTML, images, etc.).
# - Shows how to restrict `allowed_formats` and override `format_options` per format.
# - Writes results (Markdown, JSON, YAML) to `scratch/`.
#
# Prerequisites
# - Install Docling and any format-specific dependencies (e.g., for DOCX/PPTX parsing).
# - Ensure you can import `docling` from your Python environment.
# - YAML export requires `PyYAML` (`pip install pyyaml`).
#
# How to run
# - From the repository root, run: `python docs/examples/run_with_formats.py`.
# - Outputs are written under `scratch/` next to where you run the script.
# - If `scratch/` does not exist, create it before running.
#
# Customizing inputs
# - Update `input_paths` to include or remove files on your machine.
# - Non-whitelisted formats are ignored (see `allowed_formats`).
#
# Notes
# - `allowed_formats`: explicit whitelist of formats that will be processed.
# - `format_options`: per-format pipeline/backend overrides. Everything is optional; defaults exist.
# - Exports: per input, writes `<stem>.md`, `<stem>.json`, and `<stem>.yaml` in `scratch/`.
# %%
import json
import logging
from pathlib import Path
@@ -34,39 +63,41 @@ def main():
## to customize use:
doc_converter = (
DocumentConverter( # all of the below is optional, has internal defaults.
allowed_formats=[
InputFormat.PDF,
InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
InputFormat.ASCIIDOC,
InputFormat.CSV,
InputFormat.MD,
], # whitelist formats, non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
),
},
)
# Below we explicitly whitelist formats and override behavior for some of them.
# You can omit this block and use the defaults (see above) for a quick start.
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
allowed_formats=[
InputFormat.PDF,
InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
InputFormat.ASCIIDOC,
InputFormat.CSV,
InputFormat.MD,
], # whitelist formats, non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimplePipeline # or set a backend, e.g., MsWordDocumentBackend
# If you change the backend, remember to import it, e.g.:
# from docling.backend.msword_backend import MsWordDocumentBackend
),
},
)
conv_results = doc_converter.convert_all(input_paths)
for res in conv_results:
out_path = Path("scratch")
out_path = Path("scratch") # ensure this directory exists before running
print(
f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {out_path!s}"
)
_log.debug(res.document._export_to_indented_text(max_text_len=16))
# Export Docling document format to markdowndoc:
# Export Docling document to Markdown:
with (out_path / f"{res.input.file.stem}.md").open("w") as fp:
fp.write(res.document.export_to_markdown())

View File

@@ -1,3 +1,22 @@
# %% [markdown]
# Detect language automatically with Tesseract OCR and force full-page OCR.
#
# What this example does
# - Configures Tesseract (CLI in this snippet) with `lang=["auto"]`.
# - Forces full-page OCR and prints the recognized text as Markdown.
#
# How to run
# - From the repo root: `python docs/examples/tesseract_lang_detection.py`.
# - Ensure Tesseract CLI (or library) is installed and on PATH.
#
# Notes
# - You can switch to `TesseractOcrOptions` instead of `TesseractCliOcrOptions`.
# - Language packs must be installed; set `TESSDATA_PREFIX` if Tesseract
# cannot find language data. Using `lang=["auto"]` requires traineddata
# that supports script/language detection on your system.
# %%
from pathlib import Path
from docling.datamodel.base_models import InputFormat

View File

@@ -1,3 +1,23 @@
# %% [markdown]
# Translate extracted text content and regenerate Markdown with embedded images.
#
# What this example does
# - Converts a PDF and saves original Markdown with embedded images.
# - Translates text elements and table cell contents, then saves a translated Markdown.
#
# Prerequisites
# - Install Docling. Add a translation library of your choice inside `translate()`.
#
# How to run
# - From the repo root: `python docs/examples/translate.py`.
# - The script writes original and translated Markdown to `scratch/`.
#
# Notes
# - `translate()` is a placeholder; integrate your preferred translation API/client.
# - Image generation is enabled to preserve embedded images in the output.
# %%
import logging
from pathlib import Path
@@ -32,7 +52,7 @@ def main():
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
output_dir = Path("scratch")
output_dir = Path("scratch") # ensure this directory exists before saving
# Important: to work with page images, we must keep them; otherwise the DocumentConverter
# will discard them to free up memory.
@@ -56,6 +76,7 @@ def main():
doc_filename = conv_res.input.file.name
# Save markdown with embedded pictures in original text
# Tip: create the `scratch/` folder first or adjust `output_dir`.
md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

View File

@@ -1,3 +1,31 @@
# %% [markdown]
# Use the VLM pipeline with remote API models (LM Studio, Ollama, watsonx.ai).
#
# What this example does
# - Shows how to configure `ApiVlmOptions` for different VLM providers.
# - Converts a single PDF page using the VLM pipeline and prints Markdown.
#
# Prerequisites
# - Install Docling with VLM extras and `python-dotenv` if using environment files.
# - For local APIs: run LM Studio (HTTP server) or Ollama locally.
# - For cloud APIs: set required environment variables (see below).
# - Requires `requests` for HTTP calls and `python-dotenv` if loading env vars from `.env`.
#
# How to run
# - From the repo root: `python docs/examples/vlm_pipeline_api_model.py`.
# - The script prints the converted Markdown to stdout.
#
# Choosing a provider
# - Uncomment exactly one `pipeline_options.vlm_options = ...` block below.
# - Keep `enable_remote_services=True` to permit calling remote APIs.
#
# Notes
# - LM Studio default endpoint: `http://localhost:1234/v1/chat/completions`.
# - Ollama default endpoint: `http://localhost:11434/v1/chat/completions`.
# - watsonx.ai requires `WX_API_KEY` and `WX_PROJECT_ID` in env/`.env`.
# %%
import json
import logging
import os
@@ -170,14 +198,16 @@ def main():
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2305.03393v1-pg9.pdf"
# Configure the VLM pipeline. Enabling remote services allows HTTP calls to
# locally hosted APIs (LM Studio, Ollama) or cloud services.
pipeline_options = VlmPipelineOptions(
enable_remote_services=True # <-- this is required!
enable_remote_services=True # required when calling remote VLM endpoints
)
# The ApiVlmOptions() allows interfacing with APIs supporting
# the multi-modal chat interface. Here follow a few examples of how to configure those.
# One possibility is self-hosting model, e.g. via LM Studio, Ollama or others.
# One possibility is self-hosting the model, e.g., via LM Studio or Ollama.
# Example using the SmolDocling model with LM Studio:
# (uncomment the following lines)
@@ -208,8 +238,9 @@ def main():
# prompt="OCR the full page to markdown.",
# )
# Another possibility is using online services, e.g. watsonx.ai.
# Using requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
# Another possibility is using online services, e.g., watsonx.ai.
# Using watsonx.ai requires setting the env variables WX_API_KEY and WX_PROJECT_ID
# (see the notes at the top of this file for details). You can use a .env file as well.
# (uncomment the following lines)
# pipeline_options.vlm_options = watsonx_vlm_options(
# model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."