mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
Merge from main, update OCR model and test cases
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -6,7 +6,11 @@ from typing import Iterable
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
|
||||
@@ -71,7 +75,7 @@ def main():
|
||||
# and PDF Backends for various configurations.
|
||||
# Uncomment one section at a time to see the differences in the output.
|
||||
|
||||
# PyPdfium without OCR
|
||||
# PyPdfium without EasyOCR
|
||||
# --------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=False
|
||||
@@ -83,7 +87,7 @@ def main():
|
||||
# pdf_backend=PyPdfiumDocumentBackend,
|
||||
# )
|
||||
|
||||
# PyPdfium with OCR
|
||||
# PyPdfium with EasyOCR
|
||||
# -----------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=True
|
||||
@@ -95,7 +99,7 @@ def main():
|
||||
# pdf_backend=PyPdfiumDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse without OCR
|
||||
# Docling Parse without EasyOCR
|
||||
# -------------------------
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
@@ -108,7 +112,7 @@ def main():
|
||||
}
|
||||
)
|
||||
|
||||
# Docling Parse with OCR
|
||||
# Docling Parse with EasyOCR
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=True
|
||||
@@ -120,6 +124,32 @@ def main():
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = TesseractOcrOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract CLI
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = TesseractCliOcrOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Define input files
|
||||
|
||||
Reference in New Issue
Block a user