mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
reorder sections in custom_convert
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
8ec8c38de8
commit
471daee277
@ -72,7 +72,7 @@ def main():
|
|||||||
# and PDF Backends for various configurations.
|
# and PDF Backends for various configurations.
|
||||||
# Uncomment one section at a time to see the differences in the output.
|
# Uncomment one section at a time to see the differences in the output.
|
||||||
|
|
||||||
# PyPdfium without OCR
|
# PyPdfium without EasyOCR
|
||||||
# --------------------
|
# --------------------
|
||||||
# pipeline_options = PipelineOptions()
|
# pipeline_options = PipelineOptions()
|
||||||
# pipeline_options.do_ocr=False
|
# pipeline_options.do_ocr=False
|
||||||
@ -84,7 +84,7 @@ def main():
|
|||||||
# pdf_backend=PyPdfiumDocumentBackend,
|
# pdf_backend=PyPdfiumDocumentBackend,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
# PyPdfium with OCR
|
# PyPdfium with EasyOCR
|
||||||
# -----------------
|
# -----------------
|
||||||
# pipeline_options = PipelineOptions()
|
# pipeline_options = PipelineOptions()
|
||||||
# pipeline_options.do_ocr=True
|
# pipeline_options.do_ocr=True
|
||||||
@ -96,7 +96,7 @@ def main():
|
|||||||
# pdf_backend=PyPdfiumDocumentBackend,
|
# pdf_backend=PyPdfiumDocumentBackend,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
# Docling Parse without OCR
|
# Docling Parse without EasyOCR
|
||||||
# -------------------------
|
# -------------------------
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PipelineOptions()
|
||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
@ -108,7 +108,7 @@ def main():
|
|||||||
pdf_backend=DoclingParseDocumentBackend,
|
pdf_backend=DoclingParseDocumentBackend,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Docling Parse with OCR
|
# Docling Parse with EasyOCR
|
||||||
# ----------------------
|
# ----------------------
|
||||||
# pipeline_options = PipelineOptions()
|
# pipeline_options = PipelineOptions()
|
||||||
# pipeline_options.do_ocr=True
|
# pipeline_options.do_ocr=True
|
||||||
@ -120,7 +120,15 @@ def main():
|
|||||||
# pdf_backend=DoclingParseDocumentBackend,
|
# pdf_backend=DoclingParseDocumentBackend,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
# Docling Parse with Tesseract OCR
|
# Docling Parse with Tesseract
|
||||||
|
# ----------------------
|
||||||
|
pipeline_options = PipelineOptions()
|
||||||
|
pipeline_options.do_ocr = True
|
||||||
|
pipeline_options.do_table_structure = True
|
||||||
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
pipeline_options.ocr_options = TesserOcrOptions()
|
||||||
|
|
||||||
|
# Docling Parse with Tesseract CLI
|
||||||
# ----------------------
|
# ----------------------
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PipelineOptions()
|
||||||
pipeline_options.do_ocr = True
|
pipeline_options.do_ocr = True
|
||||||
@ -128,14 +136,6 @@ def main():
|
|||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
pipeline_options.ocr_options = TesseractOcrOptions()
|
pipeline_options.ocr_options = TesseractOcrOptions()
|
||||||
|
|
||||||
# Docling Parse with TesserOCR
|
|
||||||
# ----------------------
|
|
||||||
# pipeline_options = PipelineOptions()
|
|
||||||
# pipeline_options.do_ocr=True
|
|
||||||
# pipeline_options.do_table_structure=True
|
|
||||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
|
||||||
# pipeline_options.ocr_options = TesserOcrOptions()
|
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
pdf_backend=DoclingParseDocumentBackend,
|
||||||
|
Loading…
Reference in New Issue
Block a user