docs: Added documentation to use SuryaOCR via plugin docling-surya (#2533)

* docs: Added documentation to use SuryaOCR via plugin `docling-surya`

  Signed-off-by: Harry Ho <kho7@student.umgc.edu>

* Add PyPI link for docling-surya package

  Added a link to the PyPI page for docling-surya.

  Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com>

* Add licensing note for SuryaOCR integration

  Added important licensing note regarding SuryaOCR integration.

  Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com>

* Ran linter to reformat

  Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com>

---------

Signed-off-by: Harry Ho <kho7@student.umgc.edu>
Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com>
Co-authored-by: Harry Ho <kho7@student.umgc.edu>
2025-12-08 12:48:28 +00:00 · 2025-11-19 22:27:24 +08:00
parent 03e7c7d924
commit b216ad848d
3 changed files with 56 additions and 1 deletions
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -20,7 +20,7 @@ env:
    tests/test_asr_pipeline.py
    tests/test_threaded_pipeline.py
  PYTEST_TO_SKIP: |-
-  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example|gpu_standard_pipeline|gpu_vlm_pipeline|demo_layout_vlm)\.py$'
+  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|suryaocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example|gpu_standard_pipeline|gpu_vlm_pipeline|demo_layout_vlm)\.py$'

 jobs:
  lint:
--- a/docs/examples/suryaocr_with_custom_models.py
+++ b/docs/examples/suryaocr_with_custom_models.py
@@ -0,0 +1,54 @@
+# Example: Integrating SuryaOCR with Docling for PDF OCR and Markdown Export
+#
+# Overview:
+# - Configures SuryaOCR options for OCR.
+# - Executes PDF pipeline with SuryaOCR integration.
+# - Models auto-download from Hugging Face on first run.
+#
+# Prerequisites:
+# - Install: `pip install docling-surya`
+# - Ensure `docling` itself is installed and can be imported.
+#
+# Execution:
+# - Run from repo root: `python docs/examples/suryaocr_with_custom_models.py`
+# - Outputs Markdown to stdout.
+#
+# Notes:
+# - The default source is a sample EPA letter (PDF) fetched by URL; substitute a local path as needed.
+# - Models cached in `~/.cache/huggingface`; override with HF_HOME env var.
+# - On restricted networks, configure a proxy so the model and sample PDF downloads can succeed.
+# - **Important Licensing Note**: The `docling-surya` package integrates SuryaOCR, which is licensed under the GNU General Public License (GPL).
+#   Using this integration may impose GPL obligations on your project. Review the license terms carefully.
+
+# Requires `pip install docling-surya`
+# See https://pypi.org/project/docling-surya/
+from docling_surya import SuryaOcrOptions
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():  # Convert one PDF with SuryaOCR as the OCR engine and print it as Markdown.
+    source = "https://19january2021snapshot.epa.gov/sites/static/files/2016-02/documents/epa_sample_letter_sent_to_commissioners_dated_february_29_2015.pdf"
+
+    pipeline_options = PdfPipelineOptions(
+        do_ocr=True,  # enable the OCR stage of the PDF pipeline
+        ocr_model="suryaocr",  # NOTE(review): presumably the engine name registered by docling-surya - confirm
+        allow_external_plugins=True,  # NOTE(review): presumably needed because docling-surya is an external plugin - confirm
+        ocr_options=SuryaOcrOptions(lang=["en"]),  # OCR language list; English only here
+    )
+
+    converter = DocumentConverter(
+        format_options={  # the same pipeline options are used for PDF and image inputs
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+        }
+    )
+
+    result = converter.convert(source)  # run the conversion on `source`
+    print(result.document.export_to_markdown())  # write the Markdown export to stdout
+
+
+if __name__ == "__main__":
+    main()
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -92,6 +92,7 @@ nav:
      - "Force full page OCR": examples/full_page_ocr.py
      - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
      - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
+      - "SuryaOCR with custom OCR models": examples/suryaocr_with_custom_models.py
      - "Accelerator options": examples/run_with_accelerator.py
      - "Detect and obfuscate PII": examples/pii_obfuscate.py
      - "Simple translation": examples/translate.py