From e625f5d87ba9858448befcd5c421e0cc4b173b62 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Tue, 19 Nov 2024 10:57:35 +0100
Subject: [PATCH] feat: expose ocr-lang in CLI

Signed-off-by: Michele Dolfi
---
 docling/cli/main.py                   | 26 ++++++++++++++++++++++++++
 docling/datamodel/pipeline_options.py |  1 +
 2 files changed, 27 insertions(+)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index c95128ac..4554bf6f 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -129,6 +129,21 @@ def export_documents(
     )
 
 
+def _comma_split(raw: Optional[str]) -> Optional[List[str]]:
+    """Split a comma-separated CLI value into a list of clean entries.
+
+    Whitespace around each entry is stripped and empty entries (from stray
+    or trailing commas) are dropped, so ``"en, fr,"`` becomes
+    ``["en", "fr"]``. Returns ``None`` when ``raw`` is ``None`` or holds no
+    usable entry, so the caller leaves the OCR options' ``lang`` untouched.
+    """
+    if raw is None:
+        return None
+    items = [item.strip() for item in raw.split(",")]
+    # Drop blanks; an all-blank value is treated like an omitted option.
+    return [item for item in items if item] or None
+
+
 @app.command(no_args_is_help=True)
 def convert(
     input_sources: Annotated[
@@ -163,6 +178,13 @@ def convert(
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    ocr_lang: Annotated[
+        Optional[str],
+        typer.Option(
+            ...,
+            help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
+        ),
+    ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
     ] = PdfBackend.DLPARSE_V1,
@@ -248,6 +270,10 @@ def convert(
         case _:
             raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
 
+    ocr_lang_list = _comma_split(ocr_lang)
+    if ocr_lang_list is not None:
+        ocr_options.lang = ocr_lang_list
+
     pipeline_options = PdfPipelineOptions(
         do_ocr=ocr,
         ocr_options=ocr_options,
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 2b9d228c..6c0711cc 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
 
 class OcrOptions(BaseModel):
     kind: str
+    lang: List[str]
     force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
     bitmap_area_threshold: float = (
         0.05  # percentage of the area for a bitmap to processed with OCR