mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 22:14:37 +00:00
feat: expose ocr-lang in CLI
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
e6f89d520f
commit
e625f5d87b
@ -129,6 +129,12 @@ def export_documents(
|
||||
)
|
||||
|
||||
|
||||
def _comma_split(raw: Optional[str]) -> Optional[List[str]]:
|
||||
if raw is None:
|
||||
return None
|
||||
return raw.split(",")
|
||||
|
||||
|
||||
@app.command(no_args_is_help=True)
|
||||
def convert(
|
||||
input_sources: Annotated[
|
||||
@ -163,6 +169,13 @@ def convert(
|
||||
ocr_engine: Annotated[
|
||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||
] = OcrEngine.EASYOCR,
|
||||
ocr_lang: Annotated[
|
||||
Optional[str],
|
||||
typer.Option(
|
||||
...,
|
||||
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
|
||||
),
|
||||
] = None,
|
||||
pdf_backend: Annotated[
|
||||
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
||||
] = PdfBackend.DLPARSE_V1,
|
||||
@ -248,6 +261,10 @@ def convert(
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
ocr_lang_list = _comma_split(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
|
@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
|
||||
|
||||
class OcrOptions(BaseModel):
|
||||
kind: str
|
||||
lang: List[str]
|
||||
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
||||
bitmap_area_threshold: float = (
|
||||
0.05 # percentage of the area for a bitmap to be processed with OCR
|
||||
|
Loading…
Reference in New Issue
Block a user