mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
docs: Describe examples (#2262)
* Update .py examples with clearer guidance, update out of date imports and calls
  Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>
* Fix minimal.py string error, fix ruff format error
  Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>
* fix more CI issues
  Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>
---------
Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>
This commit is contained in:
63
docs/examples/custom_convert.py
vendored
63
docs/examples/custom_convert.py
vendored
@@ -1,3 +1,39 @@
|
||||
# %% [markdown]
|
||||
# Customize PDF conversion by toggling OCR/backends and pipeline options.
|
||||
#
|
||||
# What this example does
|
||||
# - Shows several alternative configurations for the Docling PDF pipeline.
|
||||
# - Lets you try OCR engines (EasyOCR, Tesseract, system OCR) or no OCR.
|
||||
# - Converts a single sample PDF and exports results to `scratch/`.
|
||||
#
|
||||
# Prerequisites
|
||||
# - Install Docling and its optional OCR backends per the docs.
|
||||
# - Ensure you can import `docling` from your Python environment.
|
||||
#
|
||||
# How to run
|
||||
# - From the repository root, run: `python docs/examples/custom_convert.py`.
|
||||
# - Outputs are written under `scratch/` in the directory you run the script from.
|
||||
#
|
||||
# Choosing a configuration
|
||||
# - Only one configuration block should be active at a time.
|
||||
# - To experiment, comment out the active section and uncomment exactly one of the others below.
|
||||
# - The file ships with "Docling Parse with EasyOCR" enabled as a sensible default.
|
||||
# - If you uncomment a backend or OCR option whose class is not already imported, also
|
||||
# import its class, e.g.:
|
||||
# - `from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend`
|
||||
# - `from docling.datamodel.pipeline_options import TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions`
|
||||
#
|
||||
# Input document
|
||||
# - Defaults to a single PDF from `tests/data/pdf/` in the repo.
|
||||
# - If you don't have the test data, update `input_doc_path` to a local PDF.
|
||||
#
|
||||
# Notes
|
||||
# - EasyOCR language: adjust `pipeline_options.ocr_options.lang` (e.g., ["en"], ["es"], ["en", "de"]).
|
||||
# - Accelerators: tune `AcceleratorOptions` to select CPU/GPU or threads.
|
||||
# - Exports: JSON, plain text, Markdown, and doctags are saved in `scratch/`.
|
||||
|
||||
# %%
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
@@ -21,9 +57,8 @@ def main():
|
||||
|
||||
###########################################################################
|
||||
|
||||
# The following sections contain a combination of PipelineOptions
|
||||
# and PDF Backends for various configurations.
|
||||
# Uncomment one section at the time to see the differences in the output.
|
||||
# The sections below demo combinations of PdfPipelineOptions and backends.
|
||||
# Tip: Uncomment exactly one section at a time to compare outputs.
|
||||
|
||||
# PyPdfium without EasyOCR
|
||||
# --------------------
|
||||
@@ -68,8 +103,10 @@ def main():
|
||||
# }
|
||||
# )
|
||||
|
||||
# Docling Parse with EasyOCR
|
||||
# ----------------------
|
||||
# Docling Parse with EasyOCR (default)
|
||||
# -------------------------------
|
||||
# Enables OCR and table structure with EasyOCR, using automatic device
|
||||
# selection via AcceleratorOptions. Adjust languages as needed.
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
pipeline_options.do_table_structure = True
|
||||
@@ -86,7 +123,7 @@ def main():
|
||||
)
|
||||
|
||||
# Docling Parse with EasyOCR (CPU only)
|
||||
# ----------------------
|
||||
# -------------------------------------
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.ocr_options.use_gpu = False # <-- set this.
|
||||
@@ -100,7 +137,7 @@ def main():
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract
|
||||
# ----------------------
|
||||
# ----------------------------
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
@@ -114,7 +151,7 @@ def main():
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract CLI
|
||||
# ----------------------
|
||||
# --------------------------------
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
@@ -127,8 +164,8 @@ def main():
|
||||
# }
|
||||
# )
|
||||
|
||||
# Docling Parse with ocrmac(Mac only)
|
||||
# ----------------------
|
||||
# Docling Parse with ocrmac (macOS only)
|
||||
# --------------------------------------
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
@@ -154,13 +191,13 @@ def main():
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
doc_filename = conv_result.input.file.stem
|
||||
|
||||
# Export Deep Search document JSON format:
|
||||
# Export Docling document JSON format:
|
||||
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
|
||||
fp.write(json.dumps(conv_result.document.export_to_dict()))
|
||||
|
||||
# Export Text format:
|
||||
# Export Text format (plain text via Markdown export):
|
||||
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_result.document.export_to_text())
|
||||
fp.write(conv_result.document.export_to_markdown(strict_text=True))
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
|
||||
|
||||
Reference in New Issue
Block a user