CLI and error handling fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-15 15:58:39 +02:00
parent a66c4ee8eb
commit ba9eaf1bd7
7 changed files with 64 additions and 41 deletions

View File

@ -393,7 +393,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if contains_lists is None: if contains_lists is None:
return cell.text return cell.text
else: else:
_log.warn( _log.debug(
"should extract the content correctly for table-cells with lists ..." "should extract the content correctly for table-cells with lists ..."
) )
return cell.text return cell.text

View File

@ -157,9 +157,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
new_list = None new_list = None
if is_a_list: if is_a_list:
_log.info("LIST DETECTED!") _log.debug("LIST DETECTED!")
else: else:
_log.info("No List") _log.debug("No List")
# for e in p.iter(): # for e in p.iter():
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}): for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):

View File

@ -138,7 +138,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
try: try:
self.handle_tables(element, docx_obj, doc) self.handle_tables(element, docx_obj, doc)
except Exception: except Exception:
_log.error("could not parse a table, broken docx table") _log.debug("could not parse a table, broken docx table")
elif found_drawing or found_pict: elif found_drawing or found_pict:
self.handle_pictures(element, docx_obj, doc) self.handle_pictures(element, docx_obj, doc)
@ -146,7 +146,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
elif tag_name in ["p"]: elif tag_name in ["p"]:
self.handle_text_elements(element, docx_obj, doc) self.handle_text_elements(element, docx_obj, doc)
else: else:
_log.warn(f"Ignoring element in DOCX with tag: {tag_name}") _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc return doc
def str_to_int(self, s, default=0): def str_to_int(self, s, default=0):

View File

@ -5,7 +5,7 @@ import time
import warnings import warnings
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from typing import Annotated, Iterable, List, Optional from typing import Annotated, Dict, Iterable, List, Optional
import typer import typer
from docling_core.utils.file import resolve_file_source from docling_core.utils.file import resolve_file_source
@ -26,7 +26,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -152,6 +152,14 @@ def convert(
ocr_engine: Annotated[ ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.") OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR, ] = OcrEngine.EASYOCR,
abort_on_error: Annotated[
bool,
typer.Option(
...,
"--abort-on-error/--no-abort-on-error",
        help="If enabled, the conversion will be aborted when the first error is encountered.",
),
] = False,
output: Annotated[ output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.") Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."), ] = Path("."),
@ -211,18 +219,22 @@ def convert(
) )
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
doc_converter = DocumentConverter( format_options: Dict[InputFormat, FormatOption] = {
format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend, # pdf_backend backend=DoclingParseDocumentBackend, # pdf_backend
) )
} }
doc_converter = DocumentConverter(
allowed_formats=from_formats,
format_options=format_options,
) )
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert_all(input_doc_paths) conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error
)
output.mkdir(parents=True, exist_ok=True) output.mkdir(parents=True, exist_ok=True)
export_documents( export_documents(

View File

@ -160,7 +160,7 @@ class InputDocument(BaseModel):
) -> None: ) -> None:
if backend is None: if backend is None:
raise RuntimeError( raise RuntimeError(
f"No backend configuration provided for file {self.file} with format {self.format}. " f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter." f"Please check your format configuration on DocumentConverter."
) )
@ -472,8 +472,8 @@ class _DocumentConversionInput(BaseModel):
obj = resolve_file_source(item) if isinstance(item, str) else item obj = resolve_file_source(item) if isinstance(item, str) else item
format = self._guess_format(obj) format = self._guess_format(obj)
if format not in format_options.keys(): if format not in format_options.keys():
_log.debug( _log.info(
f"Skipping input document {obj.name} because its format is not in the whitelist." f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
) )
continue continue
else: else:

View File

@ -111,6 +111,14 @@ class DocumentConverter:
_log.debug(f"Requested format {f} will use default options.") _log.debug(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f] self.format_to_options[f] = _format_to_default_options[f]
remove_keys = []
for f in self.format_to_options.keys():
if f not in self.allowed_formats:
remove_keys.append(f)
for f in remove_keys:
self.format_to_options.pop(f)
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {} self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
@validate_call(config=ConfigDict(strict=True)) @validate_call(config=ConfigDict(strict=True))
@ -176,7 +184,7 @@ class DocumentConverter:
# Note: PDF backends are not thread-safe, thread pool usage was disabled. # Note: PDF backends are not thread-safe, thread pool usage was disabled.
for item in map( for item in map(
partial(self.process_document, raises_on_error=raises_on_error), partial(self._process_document, raises_on_error=raises_on_error),
input_batch, input_batch,
): ):
if item is not None: if item is not None:
@ -205,20 +213,20 @@ class DocumentConverter:
) )
return self.initialized_pipelines[pipeline_class] return self.initialized_pipelines[pipeline_class]
def process_document( def _process_document(
self, in_doc: InputDocument, raises_on_error: bool self, in_doc: InputDocument, raises_on_error: bool
) -> Optional[ConversionResult]: ) -> Optional[ConversionResult]:
assert self.allowed_formats is not None assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats
if in_doc.format not in self.allowed_formats:
return None
else:
start_doc_time = time.time() start_doc_time = time.time()
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
end_doc_time = time.time() - start_doc_time end_doc_time = time.time() - start_doc_time
_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.") _log.info(
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
)
return conv_res return conv_res
@ -228,12 +236,21 @@ class DocumentConverter:
if in_doc.valid: if in_doc.valid:
pipeline = self._get_pipeline(in_doc) pipeline = self._get_pipeline(in_doc)
if pipeline is None: # Can't find a default pipeline. Should this raise? if pipeline is None: # Can't find a default pipeline. Should this raise?
if raises_on_error:
raise RuntimeError(
f"No pipeline could be initialized for {in_doc.file}."
)
else:
conv_res = ConversionResult(input=in_doc) conv_res = ConversionResult(input=in_doc)
conv_res.status = ConversionStatus.FAILURE conv_res.status = ConversionStatus.FAILURE
return conv_res return conv_res
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error) conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
else:
if raises_on_error:
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
else: else:
# invalid doc or not of desired format # invalid doc or not of desired format
conv_res = ConversionResult(input=in_doc) conv_res = ConversionResult(input=in_doc)

View File

@ -34,12 +34,6 @@ class BasePipeline(ABC):
conv_res = ConversionResult(input=in_doc) conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}") _log.info(f"Processing document {in_doc.file.name}")
if not in_doc.valid:
conv_res.status = ConversionStatus.FAILURE
return conv_res
# TODO: propagate option for raises_on_error?
try: try:
# These steps are building and assembling the structure of the # These steps are building and assembling the structure of the
# output DoclingDocument # output DoclingDocument
@ -155,7 +149,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
pass pass
end_pb_time = time.time() - start_pb_time end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}") _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
except Exception as e: except Exception as e:
conv_res.status = ConversionStatus.FAILURE conv_res.status = ConversionStatus.FAILURE