mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: add options for choosing OCR engines (#118)
--------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Signed-off-by: Peter Staar <taa@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
@@ -8,6 +8,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -71,7 +75,7 @@ def main():
|
||||
# and PDF Backends for various configurations.
|
||||
# Uncomment one section at the time to see the differences in the output.
|
||||
|
||||
# PyPdfium without OCR
|
||||
# PyPdfium without EasyOCR
|
||||
# --------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=False
|
||||
@@ -83,7 +87,7 @@ def main():
|
||||
# pdf_backend=PyPdfiumDocumentBackend,
|
||||
# )
|
||||
|
||||
# PyPdfium with OCR
|
||||
# PyPdfium with EasyOCR
|
||||
# -----------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=True
|
||||
@@ -95,7 +99,7 @@ def main():
|
||||
# pdf_backend=PyPdfiumDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse without OCR
|
||||
# Docling Parse without EasyOCR
|
||||
# -------------------------
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
@@ -107,7 +111,7 @@ def main():
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
||||
# Docling Parse with OCR
|
||||
# Docling Parse with EasyOCR
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=True
|
||||
@@ -119,6 +123,32 @@ def main():
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = TesseractOcrOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract CLI
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = TesseractCliOcrOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Define input files
|
||||
|
||||
Reference in New Issue
Block a user