docs: Describe examples (#2262)
* Update .py examples with clearer guidance, update out of date imports and calls
* Fix minimal.py string error, fix ruff format error
* fix more CI issues

Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>
46 docs/examples/batch_convert.py vendored

@@ -1,3 +1,33 @@
"""
Batch convert multiple PDF files and export results in several formats.

What this example does
- Loads a small set of sample PDFs.
- Runs the Docling PDF pipeline once per file.
- Writes outputs to `scratch/` in multiple formats (JSON, HTML, Markdown, text, doctags, YAML).

Prerequisites
- Install Docling and dependencies as described in the repository README.
- Ensure you can import `docling` from your Python environment.
- YAML export requires `PyYAML` (`pip install pyyaml`).

Input documents
- By default, this example uses a few PDFs from `tests/data/pdf/` in the repo.
- If you cloned without test data, or want to use your own files, edit
  `input_doc_paths` below to point to PDFs on your machine.

Output formats (controlled by flags)
- `USE_V2 = True` enables the current Docling document exports (recommended).
- `USE_LEGACY = False` keeps legacy Deep Search exports disabled.
  You can set it to `True` if you need legacy formats for compatibility tests.

Notes
- Set `pipeline_options.generate_page_images = True` to include page images in HTML.
- The script logs conversion progress and raises if any documents fail.
- This example shows both helper methods like `save_as_*` and lower-level
  `export_to_*` + manual file writes; outputs may overlap intentionally.
"""

import json
import logging
import time

@@ -15,6 +45,9 @@ from docling.document_converter import DocumentConverter, PdfFormatOption

_log = logging.getLogger(__name__)

# Export toggles:
# - USE_V2 controls modern Docling document exports.
# - USE_LEGACY enables legacy Deep Search exports for comparison or migration.
USE_V2 = True
USE_LEGACY = False

@@ -35,6 +68,9 @@ def export_documents(
    doc_filename = conv_res.input.file.stem

    if USE_V2:
        # Recommended modern Docling exports. These helpers mirror the
        # lower-level "export_to_*" methods used below, but handle
        # common details like image handling.
        conv_res.document.save_as_json(
            output_dir / f"{doc_filename}.json",
            image_mode=ImageRefMode.PLACEHOLDER,

@@ -121,6 +157,9 @@ def export_documents(
def main():
    logging.basicConfig(level=logging.INFO)

    # Location of sample PDFs used by this example. If your checkout does not
    # include test data, change `data_folder` or point `input_doc_paths` to
    # your own files.
    data_folder = Path(__file__).parent / "../../tests/data"
    input_doc_paths = [
        data_folder / "pdf/2206.01062.pdf",

@@ -139,6 +178,8 @@ def main():
    # settings.debug.visualize_tables = True
    # settings.debug.visualize_cells = True

    # Configure the PDF pipeline. Enabling page image generation improves HTML
    # previews (embedded images) but adds processing time.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True

@@ -152,11 +193,14 @@ def main():
    start_time = time.time()

    # Convert all inputs. Set `raises_on_error=False` to keep processing other
    # files even if one fails; errors are summarized after the run.
    conv_results = doc_converter.convert_all(
        input_doc_paths,
        raises_on_error=False,  # to let conversion run through all and examine results at the end
    )
    success_count, partial_success_count, failure_count = export_documents(
    # Write outputs to ./scratch and log a summary.
    _success_count, _partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("scratch")
    )
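For orientation, here is a minimal standalone sketch of the two export styles the new docstring contrasts: the `save_as_*` helpers versus the lower-level `export_to_*` calls. It is not part of the commit; the sample PDF path and the `scratch/` output folder are assumptions carried over from the docstring above.

import json
from pathlib import Path

from docling_core.types.doc import ImageRefMode

from docling.document_converter import DocumentConverter

output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)

conv_res = DocumentConverter().convert("tests/data/pdf/2206.01062.pdf")
stem = conv_res.input.file.stem

# Helper method: writes the file and handles image references for you.
conv_res.document.save_as_json(
    output_dir / f"{stem}.json", image_mode=ImageRefMode.PLACEHOLDER
)

# Lower-level equivalent: export to a dict and write it manually.
with (output_dir / f"{stem}.manual.json").open("w", encoding="utf-8") as fp:
    json.dump(conv_res.document.export_to_dict(), fp)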
32 docs/examples/compare_vlm_models.py vendored

@@ -1,8 +1,28 @@
# Compare VLM models
# ==================
# %% [markdown]
# Compare different VLM models by running the VLM pipeline and timing outputs.
#
# This example runs the VLM pipeline with different vision-language models.
# Their runtime as well as output quality is compared.
# What this example does
# - Iterates through a list of VLM model configurations and converts the same file.
# - Prints per-page generation times and saves JSON/MD/HTML to `scratch/`.
# - Summarizes total inference time and pages processed in a table.
#
# Requirements
# - Install `tabulate` for pretty printing (`pip install tabulate`).
#
# Prerequisites
# - Install Docling with VLM extras. Ensure models can be downloaded or are available.
#
# How to run
# - From the repo root: `python docs/examples/compare_vlm_models.py`.
# - Results are saved to `scratch/` with filenames including the model and framework.
#
# Notes
# - MLX models are skipped automatically on non-macOS platforms.
# - On CUDA systems, you can enable flash_attention_2 (see commented lines).
# - Running multiple VLMs can be GPU/CPU intensive and time-consuming; ensure
#   enough VRAM/system RAM and close other memory-heavy apps.

# %%

import json
import sys

@@ -31,6 +51,8 @@ from docling.pipeline.vlm_pipeline import VlmPipeline


def convert(sources: list[Path], converter: DocumentConverter):
    # Note: this helper assumes a single-item `sources` list. It returns after
    # processing the first source to keep runtime/output focused.
    model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
    framework = pipeline_options.vlm_options.inference_framework
    for source in sources:

@@ -61,6 +83,8 @@ def convert(sources: list[Path], converter: DocumentConverter):

        print("===== Final output of the converted document =======")

        # Manual export for illustration. Below, `save_as_json()` writes the same
        # JSON again; kept intentionally to show both approaches.
        with (out_path / f"{fname}.json").open("w") as fp:
            fp.write(json.dumps(res.document.export_to_dict()))
63 docs/examples/custom_convert.py vendored

@@ -1,3 +1,39 @@
# %% [markdown]
# Customize PDF conversion by toggling OCR/backends and pipeline options.
#
# What this example does
# - Shows several alternative configurations for the Docling PDF pipeline.
# - Lets you try OCR engines (EasyOCR, Tesseract, system OCR) or no OCR.
# - Converts a single sample PDF and exports results to `scratch/`.
#
# Prerequisites
# - Install Docling and its optional OCR backends per the docs.
# - Ensure you can import `docling` from your Python environment.
#
# How to run
# - From the repository root, run: `python docs/examples/custom_convert.py`.
# - Outputs are written under `scratch/` next to where you run the script.
#
# Choosing a configuration
# - Only one configuration block should be active at a time.
# - Uncomment exactly one of the sections below to experiment.
# - The file ships with "Docling Parse with EasyOCR" enabled as a sensible default.
# - If you uncomment a backend or OCR option that is not imported above, also
#   import its class, e.g.:
#   - `from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend`
#   - `from docling.datamodel.pipeline_options import TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions`
#
# Input document
# - Defaults to a single PDF from `tests/data/pdf/` in the repo.
# - If you don't have the test data, update `input_doc_path` to a local PDF.
#
# Notes
# - EasyOCR language: adjust `pipeline_options.ocr_options.lang` (e.g., ["en"], ["es"], ["en", "de"]).
# - Accelerators: tune `AcceleratorOptions` to select CPU/GPU or threads.
# - Exports: JSON, plain text, Markdown, and doctags are saved in `scratch/`.

# %%

import json
import logging
import time

@@ -21,9 +57,8 @@ def main():

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at the time to see the differences in the output.
    # The sections below demo combinations of PdfPipelineOptions and backends.
    # Tip: Uncomment exactly one section at a time to compare outputs.

    # PyPdfium without EasyOCR
    # --------------------

@@ -68,8 +103,10 @@ def main():
    # }
    # )

    # Docling Parse with EasyOCR
    # ----------------------
    # Docling Parse with EasyOCR (default)
    # -------------------------------
    # Enables OCR and table structure with EasyOCR, using automatic device
    # selection via AcceleratorOptions. Adjust languages as needed.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True

@@ -86,7 +123,7 @@ def main():
    )

    # Docling Parse with EasyOCR (CPU only)
    # ----------------------
    # -------------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.ocr_options.use_gpu = False  # <-- set this.

@@ -100,7 +137,7 @@ def main():
    # )

    # Docling Parse with Tesseract
    # ----------------------
    # ----------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True

@@ -114,7 +151,7 @@ def main():
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
    # --------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True

@@ -127,8 +164,8 @@ def main():
    # }
    # )

    # Docling Parse with ocrmac(Mac only)
    # ----------------------
    # Docling Parse with ocrmac (macOS only)
    # --------------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True

@@ -154,13 +191,13 @@ def main():
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem

    # Export Deep Search document JSON format:
    # Export Docling document JSON format:
    with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
        fp.write(json.dumps(conv_result.document.export_to_dict()))

    # Export Text format:
    # Export Text format (plain text via Markdown export):
    with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_text())
        fp.write(conv_result.document.export_to_markdown(strict_text=True))

    # Export Markdown format:
    with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
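To make the "uncomment one section" guidance concrete, here is a hedged, self-contained sketch of the "PyPdfium without EasyOCR" variant wired end to end; it only recombines classes already referenced in this diff, and the input path is the docstring's sample PDF.

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False             # skip OCR entirely
pipeline_options.do_table_structure = True  # keep table structure recovery

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
        )
    }
)
result = doc_converter.convert("tests/data/pdf/2206.01062.pdf")
print(result.document.export_to_markdown())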
23 docs/examples/develop_formula_understanding.py vendored

@@ -1,6 +1,21 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual formula understanding model.
# %% [markdown]
# Developing an enrichment model example (formula understanding: scaffold only).
#
# What this example does
# - Shows how to define pipeline options, an enrichment model, and extend a pipeline.
# - Displays cropped images of formula items and yields them back unchanged.
#
# Important
# - This is a development scaffold; it does not run a real formula understanding model.
#
# How to run
# - From the repo root: `python docs/examples/develop_formula_understanding.py`.
#
# Notes
# - Set `do_formula_understanding=True` to enable the example enrichment stage.
# - Extends `StandardPdfPipeline` and keeps the backend when enrichment is enabled.

# %%

import logging
from collections.abc import Iterable

@@ -42,6 +57,8 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel
            return

        for enrich_element in element_batch:
            # Opens a window for each cropped formula image; comment this out when
            # running headless or processing many items to avoid blocking spam.
            enrich_element.image.show()

            yield enrich_element.item
23 docs/examples/develop_picture_enrichment.py vendored

@@ -1,6 +1,21 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual picture classifier model.
# %% [markdown]
# Developing a picture enrichment model (classifier scaffold only).
#
# What this example does
# - Demonstrates how to implement an enrichment model that annotates pictures.
# - Adds a dummy PictureClassificationData entry to each PictureItem.
#
# Important
# - This is a scaffold for development; it does not run a real classifier.
#
# How to run
# - From the repo root: `python docs/examples/develop_picture_enrichment.py`.
#
# Notes
# - Enables picture image generation and sets `images_scale` to improve crops.
# - Extends `StandardPdfPipeline` with a custom enrichment stage.

# %%

import logging
from collections.abc import Iterable

@@ -43,7 +58,7 @@ class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
            assert isinstance(element, PictureItem)

            # uncomment this to interactively visualize the image
            # element.get_image(doc).show()
            # element.get_image(doc).show()  # may block; avoid in headless runs

            element.annotations.append(
                PictureClassificationData(
27 docs/examples/enrich_doclingdocument.py vendored

@@ -1,6 +1,26 @@
## Enrich DoclingDocument
# This example allows to run Docling enrichment models on documents which have been already converted
# and stored as serialized DoclingDocument JSON files.
# %% [markdown]
# Enrich an existing DoclingDocument JSON with a custom model (post-conversion).
#
# What this example does
# - Loads a previously converted DoclingDocument from JSON (no reconversion).
# - Uses a backend to crop images for items and runs an enrichment model in batches.
# - Prints a few example annotations to stdout.
#
# Prerequisites
# - A DoclingDocument JSON produced by another conversion (path configured below).
# - Install Docling and dependencies for the chosen enrichment model.
# - Ensure the JSON and the referenced PDF match (same document/version), so
#   provenance bounding boxes line up for accurate cropping.
#
# How to run
# - From the repo root: `python docs/examples/enrich_doclingdocument.py`.
# - Adjust `input_doc_path` and `input_pdf_path` if your data is elsewhere.
#
# Notes
# - `BATCH_SIZE` controls how many elements are passed to the model at once.
# - `prepare_element()` crops context around elements based on the model's expansion.

# %%

### Load modules

@@ -24,6 +44,7 @@ from docling.utils.utils import chunkify
### Define batch size used for processing

BATCH_SIZE = 4
# Trade-off: larger batches improve throughput but increase memory usage.

### From DocItem to the model inputs
# The following function is responsible for taking an item and applying the required pre-processing for the model.
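As a toy illustration of the batch-size trade-off noted above (a sketch only: it assumes `chunkify(iterable, chunk_size)` yields groups of at most `chunk_size` items, which is how the example consumes it):

from docling.utils.utils import chunkify

BATCH_SIZE = 4
items = list(range(10))  # stand-in for the DocItem elements being enriched

for batch in chunkify(items, BATCH_SIZE):
    # each batch would go to the enrichment model in one call;
    # larger batches mean fewer calls but more memory per call
    print(len(batch), batch)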
35 docs/examples/export_figures.py vendored

@@ -1,3 +1,29 @@
# %% [markdown]
# Export page, figure, and table images from a PDF and save rich outputs.
#
# What this example does
# - Converts a PDF, keeps page/element images, and writes them to `scratch/`.
# - Exports Markdown and HTML with either embedded or referenced images.
#
# Prerequisites
# - Install Docling and image dependencies. Pillow is used for image saves
#   (`pip install pillow`) if not already available via Docling's deps.
# - Ensure you can import `docling` from your Python environment.
#
# How to run
# - From the repo root: `python docs/examples/export_figures.py`.
# - Outputs (PNG, MD, HTML) are written to `scratch/`.
#
# Key options
# - `IMAGE_RESOLUTION_SCALE`: increase to render higher-resolution images (e.g., 2.0).
# - `PdfPipelineOptions.generate_page_images`/`generate_picture_images`: preserve images for export.
# - `ImageRefMode`: choose `EMBEDDED` or `REFERENCED` when saving Markdown/HTML.
#
# Input document
# - Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.

# %%

import logging
import time
from pathlib import Path

@@ -20,12 +46,9 @@ def main():
    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    # Keep page/element images so they can be exported. The `images_scale` controls
    # the rendered image resolution (scale=1 ~ 72 DPI). The `generate_*` toggles
    # decide which elements are enriched with images.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
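The `ImageRefMode` choice mentioned under "Key options" can be illustrated with a hedged sketch that saves the same document twice, once with embedded and once with referenced images; the paths, scale value, and output names are illustrative rather than taken from the example's code.

from pathlib import Path

from docling_core.types.doc import ImageRefMode

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0             # higher scale = higher-resolution renders
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
doc = converter.convert("tests/data/pdf/2206.01062.pdf").document

out = Path("scratch")
out.mkdir(parents=True, exist_ok=True)
doc.save_as_markdown(out / "doc-embedded.md", image_mode=ImageRefMode.EMBEDDED)
doc.save_as_markdown(out / "doc-referenced.md", image_mode=ImageRefMode.REFERENCED)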
37 docs/examples/export_multimodal.py vendored

@@ -1,3 +1,34 @@
# %% [markdown]
# Export multimodal page data (image bytes, text, segments) to a Parquet file.
#
# What this example does
# - Converts a PDF and assembles per-page multimodal records: image, cells, text, segments.
# - Normalizes records to a pandas DataFrame and writes a timestamped `.parquet` in `scratch/`.
#
# Prerequisites
# - Install Docling and `pandas`. Optional: `datasets` and `Pillow` for the commented demo.
#
# How to run
# - From the repo root: `python docs/examples/export_multimodal.py`.
# - Output parquet is written to `scratch/`.
#
# Key options
# - `IMAGE_RESOLUTION_SCALE`: page rendering scale (1 ~ 72 DPI).
# - `PdfPipelineOptions.generate_page_images`: keep page images for export.
#
# Requirements
# - Writing Parquet requires an engine such as `pyarrow` or `fastparquet`
#   (`pip install pyarrow` is the most common choice).
#
# Input document
# - Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.
#
# Notes
# - The commented block at the bottom shows how to load the Parquet with HF Datasets
#   and reconstruct images from raw bytes.

# %%

import datetime
import logging
import time

@@ -23,10 +54,8 @@ def main():
    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # Keep page images so they can be exported to the multimodal rows.
    # Use PdfPipelineOptions.images_scale to control the render scale (1 ~ 72 DPI).
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
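The Parquet requirement called out above comes down to the final pandas write; here is a hedged sketch of just that step (the row dict is a placeholder, not the example's real multimodal schema):

import pandas as pd

rows = [{"page_no": 1, "text": "example cell text"}]  # illustrative rows only
df = pd.json_normalize(rows)
df.to_parquet("scratch/multimodal_test.parquet")  # needs pyarrow or fastparquet installed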
28 docs/examples/export_tables.py vendored

@@ -1,3 +1,27 @@
# %% [markdown]
# Extract tables from a PDF and export them as CSV and HTML.
#
# What this example does
# - Converts a PDF and iterates detected tables.
# - Prints each table as Markdown to stdout, and saves CSV/HTML to `scratch/`.
#
# Prerequisites
# - Install Docling and `pandas`.
#
# How to run
# - From the repo root: `python docs/examples/export_tables.py`.
# - Outputs are written to `scratch/`.
#
# Input document
# - Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.
#
# Notes
# - `table.export_to_dataframe()` returns a pandas DataFrame for convenient export/processing.
# - Printing via `DataFrame.to_markdown()` may require the optional `tabulate` package
#   (`pip install tabulate`). If unavailable, skip the print or use `to_csv()`.

# %%

import logging
import time
from pathlib import Path

@@ -32,12 +56,12 @@ def main():
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as csv
        # Save the table as CSV
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
        _log.info(f"Saving CSV table to {element_csv_filename}")
        table_df.to_csv(element_csv_filename)

        # Save the table as html
        # Save the table as HTML
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
        _log.info(f"Saving HTML table to {element_html_filename}")
        with element_html_filename.open("w") as fp:
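A compact sketch of the DataFrame round-trip described in the notes, using only calls shown in this diff (`export_to_dataframe()`, `to_csv()`); the input path and output names are assumptions.

from pathlib import Path

from docling.document_converter import DocumentConverter

result = DocumentConverter().convert("tests/data/pdf/2206.01062.pdf")

out = Path("scratch")
out.mkdir(parents=True, exist_ok=True)

for ix, table in enumerate(result.document.tables):
    df = table.export_to_dataframe()        # pandas DataFrame of the table body
    df.to_csv(out / f"table-{ix + 1}.csv")  # same 1-based naming as the example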
31 docs/examples/full_page_ocr.py vendored

@@ -1,3 +1,31 @@
# %% [markdown]
# Force full-page OCR on a PDF using different OCR backends.
#
# What this example does
# - Enables full-page OCR and table structure extraction for a sample PDF.
# - Demonstrates how to switch between OCR backends via `ocr_options`.
#
# Prerequisites
# - Install Docling and the desired OCR backend's dependencies (Tesseract, EasyOCR,
#   RapidOCR, or macOS OCR).
#
# How to run
# - From the repo root: `python docs/examples/full_page_ocr.py`.
# - The script prints Markdown text to stdout.
#
# Choosing an OCR backend
# - Uncomment one `ocr_options = ...` line below. Exactly one should be active.
# - `force_full_page_ocr=True` processes each page purely via OCR (often slower
#   than hybrid detection). Use when layout extraction is unreliable or the PDF
#   contains scanned pages.
# - If you switch OCR backends, ensure the corresponding option class is imported,
#   e.g., `EasyOcrOptions`, `TesseractOcrOptions`, `OcrMacOptions`, `RapidOcrOptions`.
#
# Input document
# - Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.

# %%

from pathlib import Path

from docling.datamodel.base_models import InputFormat

@@ -17,7 +45,8 @@ def main():
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only), RapidOcrOptions
    # Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions,
    # TesseractCliOcrOptions, OcrMacOptions (macOS only), RapidOcrOptions
    # ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    # ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
    # ocr_options = OcrMacOptions(force_full_page_ocr=True)
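To show where the commented `ocr_options` lines plug in, here is a hedged end-to-end sketch with EasyOCR forced to full-page mode (swap the options class to change engines; the input path is the docstring's default):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
print(converter.convert("tests/data/pdf/2206.01062.pdf").document.export_to_markdown())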
20 docs/examples/inspect_picture_content.py vendored

@@ -1,12 +1,30 @@
# %% [markdown]
# Inspect the contents associated with each picture in a converted document.
#
# What this example does
# - Converts a PDF and iterates over each PictureItem.
# - Prints the caption and the textual items contained within the picture region.
#
# How to run
# - From the repo root: `python docs/examples/inspect_picture_content.py`.
#
# Notes
# - Uncomment `picture.get_image(doc).show()` to visually inspect each picture.
# - Adjust `source` to point to a different PDF if desired.

# %%

from docling_core.types.doc import TextItem

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Change this to a local path if desired
source = "tests/data/pdf/amt_handbook_sample.pdf"

pipeline_options = PdfPipelineOptions()
# Higher scale yields sharper crops when inspecting picture content.
pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True

@@ -19,7 +37,7 @@ result = doc_converter.convert(source)
doc = result.document

for picture in doc.pictures:
    # picture.get_image(doc).show() # display the picture
    # picture.get_image(doc).show()  # display the picture
    print(picture.caption_text(doc), " contains these elements:")

    for item, level in doc.iterate_items(root=picture, traverse_pictures=True):
31 docs/examples/minimal.py vendored

@@ -1,9 +1,32 @@
# %% [markdown]
# Simple conversion: one document to Markdown
# ==========================================
#
# What this example does
# - Converts a single source (URL or local file path) to a unified Docling
#   document and prints Markdown to stdout.
#
# Requirements
# - Python 3.9+
# - Install Docling: `pip install docling`
#
# How to run
# - Use the default sample URL: `python docs/examples/minimal.py`
# - To use your own file or URL, edit the `source` variable below.
#
# Notes
# - The converter auto-detects supported formats (PDF, DOCX, HTML, PPTX, images, etc.).
# - For batch processing or saving outputs to files, see `docs/examples/batch_convert.py`.

from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2408.09869"  # document per local path or URL
# Change this to a local path or another URL if desired.
# Note: using the default URL requires network access; if offline, provide a
# local file path (e.g., Path("/path/to/file.pdf")).
source = "https://arxiv.org/pdf/2408.09869"

converter = DocumentConverter()
doc = converter.convert(source).document
result = converter.convert(source)

print(doc.export_to_markdown())
# output: ## Docling Technical Report [...]"
# Print Markdown to stdout.
print(result.document.export_to_markdown())
33 docs/examples/minimal_asr_pipeline.py vendored

@@ -1,3 +1,28 @@
# %% [markdown]
# Minimal ASR pipeline example: transcribe an audio file to Markdown text.
#
# What this example does
# - Configures the ASR pipeline with a default model spec and converts one audio file.
# - Prints the recognized speech segments in Markdown with timestamps.
#
# Prerequisites
# - Install Docling with ASR extras and any audio dependencies (ffmpeg, etc.).
# - Ensure your environment can download or access the configured ASR model.
# - Some formats require ffmpeg codecs; install ffmpeg and ensure it's on PATH.
#
# How to run
# - From the repository root, run: `python docs/examples/minimal_asr_pipeline.py`.
# - The script prints the transcription to stdout.
#
# Customizing the model
# - Edit `get_asr_converter()` to switch `asr_model_specs` (e.g., language or model size).
# - Keep `InputFormat.AUDIO` and `AsrPipeline` unchanged for a minimal setup.
#
# Input audio
# - Defaults to `tests/data/audio/sample_10s.mp3`. Update `audio_path` to your own file if needed.

# %%

from pathlib import Path

from docling_core.types.doc import DoclingDocument

@@ -11,7 +36,11 @@ from docling.pipeline.asr_pipeline import AsrPipeline


def get_asr_converter():
    """Create a DocumentConverter configured for ASR with whisper_turbo model."""
    """Create a DocumentConverter configured for ASR with a default model.

    Uses `asr_model_specs.WHISPER_TURBO` by default. You can swap in another
    model spec from `docling.datamodel.asr_model_specs` to experiment.
    """
    pipeline_options = AsrPipelineOptions()
    pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO

@@ -27,7 +56,7 @@ def get_asr_converter():


def asr_pipeline_conversion(audio_path: Path) -> DoclingDocument:
    """ASR pipeline conversion using whisper_turbo"""
    """Run the ASR pipeline and return a `DoclingDocument` transcript."""
    # Check if the test audio file exists
    assert audio_path.exists(), f"Test audio file not found: {audio_path}"
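A short usage sketch built only from the helpers this example defines; the audio path is the docstring's default and would need to exist locally:

from pathlib import Path

audio_path = Path("tests/data/audio/sample_10s.mp3")
doc = asr_pipeline_conversion(audio_path)  # helper defined above in this example
print(doc.export_to_markdown())            # transcript segments with timestamps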
26 docs/examples/minimal_vlm_pipeline.py vendored

@@ -1,3 +1,25 @@
# %% [markdown]
# Minimal VLM pipeline example: convert a PDF using a vision-language model.
#
# What this example does
# - Runs the VLM-powered pipeline on a PDF (by URL) and prints Markdown output.
# - Shows two setups: default (Transformers/SmolDocling) and macOS MPS/MLX.
#
# Prerequisites
# - Install Docling with VLM extras and the appropriate backend (Transformers or MLX).
# - Ensure your environment can download model weights (e.g., from Hugging Face).
#
# How to run
# - From the repository root, run: `python docs/examples/minimal_vlm_pipeline.py`.
# - The script prints the converted Markdown to stdout.
#
# Notes
# - `source` may be a local path or a URL to a PDF.
# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.SMOLDOCLING_MLX`).
# - For more configurations and model comparisons, see `docs/examples/compare_vlm_models.py`.

# %%

from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (

@@ -6,6 +28,7 @@ from docling.datamodel.pipeline_options import (
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Convert a public arXiv PDF; replace with a local path if preferred.
source = "https://arxiv.org/pdf/2501.17887"

###### USING SIMPLE DEFAULT VALUES

@@ -26,7 +49,8 @@ print(doc.export_to_markdown())


###### USING MACOS MPS ACCELERATOR
# For more options see the compare_vlm_models.py example.
# Demonstrates using MLX on macOS with MPS acceleration (macOS only).
# For more options see the `compare_vlm_models.py` example.

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.SMOLDOCLING_MLX,
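For readers who want the "simple default values" section spelled out, a hedged sketch of the default VLM setup (it assumes the default `VlmPipelineOptions()` selects the SmolDocling/Transformers model, as the docstring states; the URL is the example's own source):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions()  # defaults; see vlm_model_specs for alternatives

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
doc = converter.convert("https://arxiv.org/pdf/2501.17887").document
print(doc.export_to_markdown())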
24 docs/examples/pictures_description_api.py vendored

@@ -1,3 +1,27 @@
# %% [markdown]
# Describe pictures using a remote VLM API (vLLM, LM Studio, or watsonx.ai).
#
# What this example does
# - Configures `PictureDescriptionApiOptions` for local or cloud providers.
# - Converts a PDF, then prints each picture's caption and annotations.
#
# Prerequisites
# - Install Docling and `python-dotenv` if loading env vars from a `.env` file.
# - For local providers: ensure vLLM or LM Studio is running.
# - For watsonx.ai: set `WX_API_KEY` and `WX_PROJECT_ID` in the environment.
#
# How to run
# - From the repo root: `python docs/examples/pictures_description_api.py`.
# - Uncomment exactly one provider config and set `enable_remote_services=True` (already set).
#
# Notes
# - vLLM default endpoint: `http://localhost:8000/v1/chat/completions`.
# - LM Studio default endpoint: `http://localhost:1234/v1/chat/completions`.
# - Calling remote APIs sends page images/text to the provider; review privacy and
#   costs. For local testing, LM Studio runs everything on your machine.

# %%

import logging
import os
from pathlib import Path
37 docs/examples/rapidocr_with_custom_models.py vendored

@@ -1,25 +1,46 @@
# %% [markdown]
# Use RapidOCR with custom ONNX models to OCR a PDF page and print Markdown.
#
# What this example does
# - Downloads RapidOCR models from Hugging Face via ModelScope.
# - Configures `RapidOcrOptions` with explicit det/rec/cls model paths.
# - Runs the PDF pipeline with RapidOCR and prints Markdown output.
#
# Prerequisites
# - Install Docling, `modelscope`, and have network access to download models.
# - Ensure your environment can import `docling` and `modelscope`.
#
# How to run
# - From the repo root: `python docs/examples/rapidocr_with_custom_models.py`.
# - The script prints the recognized text as Markdown to stdout.
#
# Notes
# - The default `source` points to an arXiv PDF URL; replace with a local path if desired.
# - Model paths are derived from the downloaded snapshot directory.
# - ModelScope caches downloads (typically under `~/.cache/modelscope`); set a proxy
#   or pre-download models if running in a restricted network environment.

# %%

import os

from modelscope import snapshot_download

from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import (
    ConversionResult,
    DocumentConverter,
    InputFormat,
    PdfFormatOption,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


def main():
    # Source document to convert
    source = "https://arxiv.org/pdf/2408.09869v4"

    # Download RappidOCR models from HuggingFace
    # Download RapidOCR models from Hugging Face
    print("Downloading RapidOCR models")
    download_path = snapshot_download(repo_id="RapidAI/RapidOCR")

    # Setup RapidOcrOptions for english detection
    # Setup RapidOcrOptions for English detection
    det_model_path = os.path.join(
        download_path, "onnx", "PP-OCRv5", "det", "ch_PP-OCRv5_server_det.onnx"
    )
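A hedged sketch of how the downloaded det/rec/cls paths end up in the pipeline; the keyword names `det_model_path`, `rec_model_path`, and `cls_model_path` and the placeholder paths are assumptions based on the description above, so check them against `RapidOcrOptions` before relying on this.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

ocr_options = RapidOcrOptions(
    det_model_path="path/to/det.onnx",  # text detection model (assumed kwarg)
    rec_model_path="path/to/rec.onnx",  # text recognition model (assumed kwarg)
    cls_model_path="path/to/cls.onnx",  # orientation classification model (assumed kwarg)
)

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = ocr_options

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
print(converter.convert("https://arxiv.org/pdf/2408.09869v4").document.export_to_markdown())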
19 docs/examples/run_with_accelerator.py vendored

@@ -1,3 +1,22 @@
# %% [markdown]
# Run conversion with an explicit accelerator configuration (CPU/MPS/CUDA).
#
# What this example does
# - Shows how to select the accelerator device and thread count.
# - Enables OCR and table structure to exercise compute paths, and prints timings.
#
# How to run
# - From the repo root: `python docs/examples/run_with_accelerator.py`.
# - Toggle the commented `AcceleratorOptions` examples to try AUTO/MPS/CUDA.
#
# Notes
# - EasyOCR does not support `cuda:N` device selection (defaults to `cuda:0`).
# - `settings.debug.profile_pipeline_timings = True` prints profiling details.
# - `AcceleratorDevice.MPS` is macOS-only; `CUDA` requires a compatible GPU and
#   CUDA-enabled PyTorch build. CPU mode works everywhere.

# %%

from pathlib import Path

from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
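A hedged sketch of the device and thread selection described above; `num_threads`, `device`, and the `accelerator_options` attribute follow the imports shown here, but treat the exact field names as assumptions to verify.

from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

accelerator_options = AcceleratorOptions(num_threads=8, device=AcceleratorDevice.CPU)
# AcceleratorDevice.AUTO / .MPS / .CUDA are the other choices mentioned above.

pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)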
77 docs/examples/run_with_formats.py vendored

@@ -1,3 +1,32 @@
# %% [markdown]
# Run conversion across multiple input formats and customize handling per type.
#
# What this example does
# - Demonstrates converting a mixed list of files (PDF, DOCX, PPTX, HTML, images, etc.).
# - Shows how to restrict `allowed_formats` and override `format_options` per format.
# - Writes results (Markdown, JSON, YAML) to `scratch/`.
#
# Prerequisites
# - Install Docling and any format-specific dependencies (e.g., for DOCX/PPTX parsing).
# - Ensure you can import `docling` from your Python environment.
# - YAML export requires `PyYAML` (`pip install pyyaml`).
#
# How to run
# - From the repository root, run: `python docs/examples/run_with_formats.py`.
# - Outputs are written under `scratch/` next to where you run the script.
# - If `scratch/` does not exist, create it before running.
#
# Customizing inputs
# - Update `input_paths` to include or remove files on your machine.
# - Non-whitelisted formats are ignored (see `allowed_formats`).
#
# Notes
# - `allowed_formats`: explicit whitelist of formats that will be processed.
# - `format_options`: per-format pipeline/backend overrides. Everything is optional; defaults exist.
# - Exports: per input, writes `<stem>.md`, `<stem>.json`, and `<stem>.yaml` in `scratch/`.

# %%

import json
import logging
from pathlib import Path

@@ -34,39 +63,41 @@ def main():

    ## to customize use:

    doc_converter = (
        DocumentConverter(  # all of the below is optional, has internal defaults.
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.CSV,
                InputFormat.MD,
            ],  # whitelist formats, non-matching files are ignored.
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
                ),
            },
        )
    # Below we explicitly whitelist formats and override behavior for some of them.
    # You can omit this block and use the defaults (see above) for a quick start.
    doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
            InputFormat.ASCIIDOC,
            InputFormat.CSV,
            InputFormat.MD,
        ],  # whitelist formats, non-matching files are ignored.
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
            ),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline  # or set a backend, e.g., MsWordDocumentBackend
                # If you change the backend, remember to import it, e.g.:
                # from docling.backend.msword_backend import MsWordDocumentBackend
            ),
        },
    )

    conv_results = doc_converter.convert_all(input_paths)

    for res in conv_results:
        out_path = Path("scratch")
        out_path = Path("scratch")  # ensure this directory exists before running
        print(
            f"Document {res.input.file.name} converted."
            f"\nSaved markdown output to: {out_path!s}"
        )
        _log.debug(res.document._export_to_indented_text(max_text_len=16))
        # Export Docling document format to markdowndoc:
        # Export Docling document to Markdown:
        with (out_path / f"{res.input.file.stem}.md").open("w") as fp:
            fp.write(res.document.export_to_markdown())
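As a counterpoint to the fully customized converter above, the quick-start path the comment alludes to is just the defaults; a hedged sketch (the input paths are placeholders):

from pathlib import Path

from docling.document_converter import DocumentConverter

converter = DocumentConverter()  # all supported formats, default pipelines and backends

for res in converter.convert_all([Path("README.md"), Path("tests/data/pdf/2206.01062.pdf")]):
    markdown = res.document.export_to_markdown()
    print(res.input.file.name, "->", len(markdown), "characters of Markdown")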
19 docs/examples/tesseract_lang_detection.py vendored

@@ -1,3 +1,22 @@
# %% [markdown]
# Detect language automatically with Tesseract OCR and force full-page OCR.
#
# What this example does
# - Configures Tesseract (CLI in this snippet) with `lang=["auto"]`.
# - Forces full-page OCR and prints the recognized text as Markdown.
#
# How to run
# - From the repo root: `python docs/examples/tesseract_lang_detection.py`.
# - Ensure Tesseract CLI (or library) is installed and on PATH.
#
# Notes
# - You can switch to `TesseractOcrOptions` instead of `TesseractCliOcrOptions`.
# - Language packs must be installed; set `TESSDATA_PREFIX` if Tesseract
#   cannot find language data. Using `lang=["auto"]` requires traineddata
#   that supports script/language detection on your system.

# %%

from pathlib import Path

from docling.datamodel.base_models import InputFormat
23 docs/examples/translate.py vendored

@@ -1,3 +1,23 @@
# %% [markdown]
# Translate extracted text content and regenerate Markdown with embedded images.
#
# What this example does
# - Converts a PDF and saves original Markdown with embedded images.
# - Translates text elements and table cell contents, then saves a translated Markdown.
#
# Prerequisites
# - Install Docling. Add a translation library of your choice inside `translate()`.
#
# How to run
# - From the repo root: `python docs/examples/translate.py`.
# - The script writes original and translated Markdown to `scratch/`.
#
# Notes
# - `translate()` is a placeholder; integrate your preferred translation API/client.
# - Image generation is enabled to preserve embedded images in the output.

# %%

import logging
from pathlib import Path

@@ -32,7 +52,7 @@ def main():

    data_folder = Path(__file__).parent / "../../tests/data"
    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")
    output_dir = Path("scratch")  # ensure this directory exists before saving

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.

@@ -56,6 +76,7 @@ def main():
    doc_filename = conv_res.input.file.name

    # Save markdown with embedded pictures in original text
    # Tip: create the `scratch/` folder first or adjust `output_dir`.
    md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
    conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)
39 docs/examples/vlm_pipeline_api_model.py vendored

@@ -1,3 +1,31 @@
# %% [markdown]
# Use the VLM pipeline with remote API models (LM Studio, Ollama, watsonx.ai).
#
# What this example does
# - Shows how to configure `ApiVlmOptions` for different VLM providers.
# - Converts a single PDF page using the VLM pipeline and prints Markdown.
#
# Prerequisites
# - Install Docling with VLM extras and `python-dotenv` if using environment files.
# - For local APIs: run LM Studio (HTTP server) or Ollama locally.
# - For cloud APIs: set required environment variables (see below).
# - Requires `requests` for HTTP calls and `python-dotenv` if loading env vars from `.env`.
#
# How to run
# - From the repo root: `python docs/examples/vlm_pipeline_api_model.py`.
# - The script prints the converted Markdown to stdout.
#
# Choosing a provider
# - Uncomment exactly one `pipeline_options.vlm_options = ...` block below.
# - Keep `enable_remote_services=True` to permit calling remote APIs.
#
# Notes
# - LM Studio default endpoint: `http://localhost:1234/v1/chat/completions`.
# - Ollama default endpoint: `http://localhost:11434/v1/chat/completions`.
# - watsonx.ai requires `WX_API_KEY` and `WX_PROJECT_ID` in env/`.env`.

# %%

import json
import logging
import os

@@ -170,14 +198,16 @@ def main():
    data_folder = Path(__file__).parent / "../../tests/data"
    input_doc_path = data_folder / "pdf/2305.03393v1-pg9.pdf"

    # Configure the VLM pipeline. Enabling remote services allows HTTP calls to
    # locally hosted APIs (LM Studio, Ollama) or cloud services.
    pipeline_options = VlmPipelineOptions(
        enable_remote_services=True  # <-- this is required!
        enable_remote_services=True  # required when calling remote VLM endpoints
    )

    # The ApiVlmOptions() allows to interface with APIs supporting
    # the multi-modal chat interface. Here follow a few example on how to configure those.

    # One possibility is self-hosting model, e.g. via LM Studio, Ollama or others.
    # One possibility is self-hosting the model, e.g., via LM Studio or Ollama.

    # Example using the SmolDocling model with LM Studio:
    # (uncomment the following lines)

@@ -208,8 +238,9 @@ def main():
    #     prompt="OCR the full page to markdown.",
    # )

    # Another possibility is using online services, e.g. watsonx.ai.
    # Using requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
    # Another possibility is using online services, e.g., watsonx.ai.
    # Using watsonx.ai requires setting env variables WX_API_KEY and WX_PROJECT_ID
    # (see the top-level docstring for details). You can use a .env file as well.
    # (uncomment the following lines)
    # pipeline_options.vlm_options = watsonx_vlm_options(
    #     model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."
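Finally, a hedged usage sketch of wiring one provider into the pipeline, using the `watsonx_vlm_options` helper this example defines (it requires `WX_API_KEY` and `WX_PROJECT_ID` in the environment; the model name and input path are the ones shown above):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = watsonx_vlm_options(  # helper defined in this example file
    model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
print(converter.convert("tests/data/pdf/2305.03393v1-pg9.pdf").document.export_to_markdown())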