docs: Describe examples (#2262)

* Update .py examples with clearer guidance, update out of date imports and calls Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> * Fix minimal.py string error, fix ruff format error Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> * fix more CI issues Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --------- Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com>
2025-12-08 20:58:11 +00:00 · 2025-09-16 10:00:38 -04:00
parent 0e95171dd6
commit ff351fd40c
21 changed files with 608 additions and 85 deletions
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -1,3 +1,39 @@
+# %% [markdown]
+# Customize PDF conversion by toggling OCR/backends and pipeline options.
+#
+# What this example does
+# - Shows several alternative configurations for the Docling PDF pipeline.
+# - Lets you try OCR engines (EasyOCR, Tesseract, system OCR) or no OCR.
+# - Converts a single sample PDF and exports results to `scratch/`.
+#
+# Prerequisites
+# - Install Docling and its optional OCR backends per the docs.
+# - Ensure you can import `docling` from your Python environment.
+#
+# How to run
+# - From the repository root, run: `python docs/examples/custom_convert.py`.
+# - Outputs are written to `scratch/`, relative to the directory you run the script from.
+#
+# Choosing a configuration
+# - Only one configuration block should be active at a time.
+# - To experiment, comment out the active section and uncomment exactly one other.
+# - The file ships with "Docling Parse with EasyOCR" enabled as a sensible default.
+# - If you uncomment a backend or OCR option this file does not already import, also
+#   import its class, e.g.:
+#   - `from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend`
+#   - `from docling.datamodel.pipeline_options import TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions`
+#
+# Input document
+# - Defaults to a single PDF from `tests/data/pdf/` in the repo.
+# - If you don't have the test data, update `input_doc_path` to a local PDF.
+#
+# Notes
+# - EasyOCR language: adjust `pipeline_options.ocr_options.lang` (e.g., ["en"], ["es"], ["en", "de"]).
+# - Accelerators: tune `AcceleratorOptions` to select CPU/GPU or threads.
+# - Exports: JSON, plain text, Markdown, and doctags are saved in `scratch/`.
+
+# %%
+
 import json
 import logging
 import time
@@ -21,9 +57,8 @@ def main():

    ###########################################################################

-    # The following sections contain a combination of PipelineOptions
-    # and PDF Backends for various configurations.
-    # Uncomment one section at the time to see the differences in the output.
+    # The sections below demo combinations of PdfPipelineOptions and backends.
+    # Tip: Keep exactly one section uncommented at a time to compare outputs.

    # PyPdfium without EasyOCR
    # --------------------
@@ -68,8 +103,10 @@ def main():
    #     }
    # )

-    # Docling Parse with EasyOCR
-    # ----------------------
+    # Docling Parse with EasyOCR (default)
+    # -------------------------------
+    # Enables OCR and table structure with EasyOCR, using automatic device
+    # selection via AcceleratorOptions. Adjust languages as needed.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
@@ -86,7 +123,7 @@ def main():
    )

    # Docling Parse with EasyOCR (CPU only)
-    # ----------------------
+    # -------------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.ocr_options.use_gpu = False  # <-- set this.
@@ -100,7 +137,7 @@ def main():
    # )

    # Docling Parse with Tesseract
-    # ----------------------
+    # ----------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
@@ -114,7 +151,7 @@ def main():
    # )

    # Docling Parse with Tesseract CLI
-    # ----------------------
+    # --------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
@@ -127,8 +164,8 @@ def main():
    #     }
    # )

-    # Docling Parse with ocrmac(Mac only)
-    # ----------------------
+    # Docling Parse with ocrmac (macOS only)
+    # --------------------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
@@ -154,13 +191,13 @@ def main():
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem

-    # Export Deep Search document JSON format:
+    # Export Docling document JSON format:
    with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
        fp.write(json.dumps(conv_result.document.export_to_dict()))

-    # Export Text format:
+    # Export Text format (plain text via Markdown export):
    with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
-        fp.write(conv_result.document.export_to_text())
+        fp.write(conv_result.document.export_to_markdown(strict_text=True))

    # Export Markdown format:
    with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp: