reformatted all

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-09-12 08:39:19 +02:00
parent 14ab351fdb
commit 3757c61703

View File

@ -1,11 +1,10 @@
import argparse
import json import json
import logging import logging
import time import time
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
import argparse
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.base_models import ConversionStatus, PipelineOptions
@ -16,6 +15,7 @@ _log = logging.getLogger(__name__)
from enum import Enum from enum import Enum
# Define an enum for the backend options # Define an enum for the backend options
class Backend(Enum): class Backend(Enum):
PDFIUM = "pdfium" PDFIUM = "pdfium"
@ -68,9 +68,7 @@ def export_documents(
def main(pdf, ocr, backend): def main(pdf, ocr, backend):
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_paths = [Path(pdf)]
Path(pdf)
]
########################################################################### ###########################################################################
@ -79,10 +77,10 @@ def main(pdf, ocr, backend):
# Uncomment one section at the time to see the differences in the output. # Uncomment one section at the time to see the differences in the output.
doc_converter = None doc_converter = None
if backend==Backend.PDFIUM.value and not ocr: # PyPdfium without OCR if backend == Backend.PDFIUM.value and not ocr: # PyPdfium without OCR
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr=False pipeline_options.do_ocr = False
pipeline_options.do_table_structure=True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = False pipeline_options.table_structure_options.do_cell_matching = False
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
@ -90,10 +88,10 @@ def main(pdf, ocr, backend):
pdf_backend=PyPdfiumDocumentBackend, pdf_backend=PyPdfiumDocumentBackend,
) )
elif backend==Backend.PDFIUM.value and ocr: # PyPdfium with OCR elif backend == Backend.PDFIUM.value and ocr: # PyPdfium with OCR
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr=False pipeline_options.do_ocr = False
pipeline_options.do_table_structure=True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
@ -101,7 +99,7 @@ def main(pdf, ocr, backend):
pdf_backend=PyPdfiumDocumentBackend, pdf_backend=PyPdfiumDocumentBackend,
) )
elif backend==Backend.DOCLING.value and not ocr: # Docling Parse without OCR elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
@ -112,10 +110,10 @@ def main(pdf, ocr, backend):
pdf_backend=DoclingParseDocumentBackend, pdf_backend=DoclingParseDocumentBackend,
) )
elif backend==Backend.DOCLING.value and ocr:# Docling Parse with OCR elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr=True pipeline_options.do_ocr = True
pipeline_options.do_table_structure=True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
@ -154,14 +152,18 @@ if __name__ == "__main__":
# Add arguments # Add arguments
parser.add_argument("--pdf", type=str, help="Path to the PDF file.") parser.add_argument("--pdf", type=str, help="Path to the PDF file.")
parser.add_argument("--ocr", type=bool, default=False, help="Enable OCR (True or False).") parser.add_argument(
"--ocr", type=bool, default=False, help="Enable OCR (True or False)."
)
# Add the backend option as an enum # Add the backend option as an enum
parser.add_argument("--backend", type=lambda b: Backend[b.upper()], parser.add_argument(
choices=list(Backend), default=Backend.DOCLING, "--backend",
help="Select backend (pdfium or docling). Default is docling.") type=lambda b: Backend[b.upper()],
choices=list(Backend),
default=Backend.DOCLING,
help="Select backend (pdfium or docling). Default is docling.",
)
# Parse the arguments # Parse the arguments
args = parser.parse_args() args = parser.parse_args()