diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 1318ee47..3ac3c303 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -20,7 +20,7 @@ env:
     tests/test_asr_pipeline.py
     tests/test_threaded_pipeline.py
   PYTEST_TO_SKIP: |-
-  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example|gpu_standard_pipeline|gpu_vlm_pipeline|demo_layout_vlm)\.py$'
+  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|suryaocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example|gpu_standard_pipeline|gpu_vlm_pipeline|demo_layout_vlm)\.py$'
 
 jobs:
   lint:
diff --git a/docs/examples/suryaocr_with_custom_models.py b/docs/examples/suryaocr_with_custom_models.py
new file mode 100644
index 00000000..15574f15
--- /dev/null
+++ b/docs/examples/suryaocr_with_custom_models.py
@@ -0,0 +1,54 @@
+# Example: Integrating SuryaOCR with Docling for PDF OCR and Markdown Export
+#
+# Overview:
+# - Configures SuryaOCR options for OCR.
+# - Executes PDF pipeline with SuryaOCR integration.
+# - Models auto-download from Hugging Face on first run.
+#
+# Prerequisites:
+# - Install: `pip install docling-surya`
+# - Ensure `docling` imports successfully.
+#
+# Execution:
+# - Run from repo root: `python docs/examples/suryaocr_with_custom_models.py`
+# - Outputs Markdown to stdout.
+#
+# Notes:
+# - Default source: EPA PDF URL; substitute with local path as needed.
+# - Models cached in `~/.cache/huggingface`; override with HF_HOME env var.
+# - Use proxy config for restricted networks.
+# - **Important Licensing Note**: The `docling-surya` package integrates SuryaOCR, which is licensed under the GNU General Public License (GPL).
+#   Using this integration may impose GPL obligations on your project. Review the license terms carefully.
+
+# Requires `pip install docling-surya`
+# See https://pypi.org/project/docling-surya/
+from docling_surya import SuryaOcrOptions
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+    source = "https://19january2021snapshot.epa.gov/sites/static/files/2016-02/documents/epa_sample_letter_sent_to_commissioners_dated_february_29_2015.pdf"
+
+    pipeline_options = PdfPipelineOptions(
+        do_ocr=True,
+        ocr_model="suryaocr",
+        allow_external_plugins=True,
+        ocr_options=SuryaOcrOptions(lang=["en"]),
+    )
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+        }
+    )
+
+    result = converter.convert(source)
+    print(result.document.export_to_markdown())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mkdocs.yml b/mkdocs.yml
index 1fb75bb8..7729ba95 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -92,6 +92,7 @@ nav:
       - "Force full page OCR": examples/full_page_ocr.py
       - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
       - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
+      - "SuryaOCR with custom OCR models": examples/suryaocr_with_custom_models.py
       - "Accelerator options": examples/run_with_accelerator.py
       - "Detect and obfuscate PII": examples/pii_obfuscate.py
       - "Simple translation": examples/translate.py