mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
feat: Introduce the force-ocr cmd parameter in docling cli. Add the full_page_ocr.py example in mkdocs
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
1963e7145b
commit
7234dc3a42
@@ -153,6 +153,13 @@ def convert(
|
|||||||
..., help="If enabled, the bitmap content will be processed using OCR."
|
..., help="If enabled, the bitmap content will be processed using OCR."
|
||||||
),
|
),
|
||||||
] = True,
|
] = True,
|
||||||
|
force_ocr: Annotated[
|
||||||
|
bool,
|
||||||
|
typer.Option(
|
||||||
|
...,
|
||||||
|
help="Replace any existing text with OCR generated text over the full content.",
|
||||||
|
),
|
||||||
|
] = False,
|
||||||
ocr_engine: Annotated[
|
ocr_engine: Annotated[
|
||||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||||
] = OcrEngine.EASYOCR,
|
] = OcrEngine.EASYOCR,
|
||||||
@@ -219,11 +226,11 @@ def convert(
|
|||||||
|
|
||||||
match ocr_engine:
|
match ocr_engine:
|
||||||
case OcrEngine.EASYOCR:
|
case OcrEngine.EASYOCR:
|
||||||
ocr_options: OcrOptions = EasyOcrOptions()
|
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||||
case OcrEngine.TESSERACT_CLI:
|
case OcrEngine.TESSERACT_CLI:
|
||||||
ocr_options = TesseractCliOcrOptions()
|
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||||
case OcrEngine.TESSERACT:
|
case OcrEngine.TESSERACT:
|
||||||
ocr_options = TesseractOcrOptions()
|
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||||
case _:
|
case _:
|
||||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||||
|
|
||||||
|
@@ -29,7 +29,6 @@ def main():
|
|||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=DoclingParseDocumentBackend,
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@@ -69,6 +69,7 @@ nav:
|
|||||||
- "Figure enrichment": examples/develop_picture_enrichment.py
|
- "Figure enrichment": examples/develop_picture_enrichment.py
|
||||||
- "Table export": examples/export_tables.py
|
- "Table export": examples/export_tables.py
|
||||||
- "Multimodal export": examples/export_multimodal.py
|
- "Multimodal export": examples/export_multimodal.py
|
||||||
|
- "Force full page OCR": examples/full_page_ocr.py
|
||||||
- RAG / QA:
|
- RAG / QA:
|
||||||
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
|
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
|
||||||
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
|
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
|
||||||
|
Loading…
Reference in New Issue
Block a user