mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
docs: Added documentation to use SuryaOCR via plugin docling-surya (#2533)
* docs: Added documentation to use SuryaOCR via plugin `docling-surya` Signed-off-by: Harry Ho <kho7@student.umgc.edu> * Add PyPI link for docling-surya package Added a link to the PyPI page for docling-surya. Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com> * Add licensing note for SuryaOCR integration Added important licensing note regarding SuryaOCR integration. Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com> * Ran linter to reformat Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com> --------- Signed-off-by: Harry Ho <kho7@student.umgc.edu> Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com> Co-authored-by: Harry Ho <kho7@student.umgc.edu>
This commit is contained in:
2
.github/workflows/checks.yml
vendored
2
.github/workflows/checks.yml
vendored
@@ -20,7 +20,7 @@ env:
|
||||
tests/test_asr_pipeline.py
|
||||
tests/test_threaded_pipeline.py
|
||||
PYTEST_TO_SKIP: |-
|
||||
EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example|gpu_standard_pipeline|gpu_vlm_pipeline|demo_layout_vlm)\.py$'
|
||||
EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|suryaocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example|gpu_standard_pipeline|gpu_vlm_pipeline|demo_layout_vlm)\.py$'
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
|
||||
54
docs/examples/suryaocr_with_custom_models.py
vendored
Normal file
54
docs/examples/suryaocr_with_custom_models.py
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
# Example: Integrating SuryaOCR with Docling for PDF OCR and Markdown Export
|
||||
#
|
||||
# Overview:
|
||||
# - Configures SuryaOCR options for OCR.
|
||||
# - Executes PDF pipeline with SuryaOCR integration.
|
||||
# - Models auto-download from Hugging Face on first run.
|
||||
#
|
||||
# Prerequisites:
|
||||
# - Install: `pip install docling-surya`
|
||||
# - Ensure `docling` imports successfully.
|
||||
#
|
||||
# Execution:
|
||||
# - Run from repo root: `python docs/examples/suryaocr_with_custom_models.py`
|
||||
# - Outputs Markdown to stdout.
|
||||
#
|
||||
# Notes:
|
||||
# - Default source: EPA PDF URL; substitute with local path as needed.
|
||||
# - Models cached in `~/.cache/huggingface`; override with HF_HOME env var.
|
||||
# - Use proxy config for restricted networks.
|
||||
# - **Important Licensing Note**: The `docling-surya` package integrates SuryaOCR, which is licensed under the GNU General Public License (GPL).
|
||||
# Using this integration may impose GPL obligations on your project. Review the license terms carefully.
|
||||
|
||||
# Requires `pip install docling-surya`
|
||||
# See https://pypi.org/project/docling-surya/
|
||||
from docling_surya import SuryaOcrOptions
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
    """Convert a sample PDF with SuryaOCR and print the result as Markdown.

    Builds PDF pipeline options with OCR enabled and `SuryaOcrOptions` as the
    OCR configuration, registers those options for both PDF and image input,
    converts `source`, and writes the Markdown export to stdout.
    """
    source = "https://19january2021snapshot.epa.gov/sites/static/files/2016-02/documents/epa_sample_letter_sent_to_commissioners_dated_february_29_2015.pdf"

    # OCR settings come from the `docling-surya` package imported at the top
    # of the file; external plugins are explicitly allowed on the pipeline.
    surya_options = SuryaOcrOptions(lang=["en"])
    pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        ocr_model="suryaocr",
        allow_external_plugins=True,
        ocr_options=surya_options,
    )

    # PDF and image inputs each get their own format option built from the
    # same pipeline options.
    format_options = {
        input_format: PdfFormatOption(pipeline_options=pipeline_options)
        for input_format in (InputFormat.PDF, InputFormat.IMAGE)
    }
    converter = DocumentConverter(format_options=format_options)

    conversion = converter.convert(source)
    markdown = conversion.document.export_to_markdown()
    print(markdown)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the example only when this file is executed as a script.
    main()
|
||||
@@ -92,6 +92,7 @@ nav:
|
||||
- "Force full page OCR": examples/full_page_ocr.py
|
||||
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
|
||||
- "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
|
||||
- "SuryaOCR with custom OCR models": examples/suryaocr_with_custom_models.py
|
||||
- "Accelerator options": examples/run_with_accelerator.py
|
||||
- "Detect and obfuscate PII": examples/pii_obfuscate.py
|
||||
- "Simple translation": examples/translate.py
|
||||
|
||||
Reference in New Issue
Block a user