CLI and error handling fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-15 15:58:39 +02:00
parent a66c4ee8eb
commit ba9eaf1bd7
7 changed files with 64 additions and 41 deletions

View File

@ -393,7 +393,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if contains_lists is None:
return cell.text
else:
_log.warn(
_log.debug(
"should extract the content correctly for table-cells with lists ..."
)
return cell.text

View File

@ -157,9 +157,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
new_list = None
if is_a_list:
_log.info("LIST DETECTED!")
_log.debug("LIST DETECTED!")
else:
_log.info("No List")
_log.debug("No List")
# for e in p.iter():
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):

View File

@ -138,7 +138,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
try:
self.handle_tables(element, docx_obj, doc)
except Exception:
_log.error("could not parse a table, broken docx table")
_log.debug("could not parse a table, broken docx table")
elif found_drawing or found_pict:
self.handle_pictures(element, docx_obj, doc)
@ -146,7 +146,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
elif tag_name in ["p"]:
self.handle_text_elements(element, docx_obj, doc)
else:
_log.warn(f"Ignoring element in DOCX with tag: {tag_name}")
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def str_to_int(self, s, default=0):

View File

@ -5,7 +5,7 @@ import time
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Iterable, List, Optional
from typing import Annotated, Dict, Iterable, List, Optional
import typer
from docling_core.utils.file import resolve_file_source
@ -26,7 +26,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -152,6 +152,14 @@ def convert(
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
abort_on_error: Annotated[
bool,
typer.Option(
...,
"--abort-on-error/--no-abort-on-error",
help="If enabled, the conversion is aborted as soon as a document fails to convert.",
),
] = False,
output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."),
@ -211,18 +219,22 @@ def convert(
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
doc_converter = DocumentConverter(
format_options={
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend, # pdf_backend
)
}
doc_converter = DocumentConverter(
allowed_formats=from_formats,
format_options=format_options,
)
start_time = time.time()
conv_results = doc_converter.convert_all(input_doc_paths)
conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error
)
output.mkdir(parents=True, exist_ok=True)
export_documents(

View File

@ -160,7 +160,7 @@ class InputDocument(BaseModel):
) -> None:
if backend is None:
raise RuntimeError(
f"No backend configuration provided for file {self.file} with format {self.format}. "
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)
@ -472,8 +472,8 @@ class _DocumentConversionInput(BaseModel):
obj = resolve_file_source(item) if isinstance(item, str) else item
format = self._guess_format(obj)
if format not in format_options.keys():
_log.debug(
f"Skipping input document {obj.name} because its format is not in the whitelist."
_log.info(
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
)
continue
else:

View File

@ -111,6 +111,14 @@ class DocumentConverter:
_log.debug(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f]
remove_keys = []
for f in self.format_to_options.keys():
if f not in self.allowed_formats:
remove_keys.append(f)
for f in remove_keys:
self.format_to_options.pop(f)
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
@validate_call(config=ConfigDict(strict=True))
@ -176,7 +184,7 @@ class DocumentConverter:
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
for item in map(
partial(self.process_document, raises_on_error=raises_on_error),
partial(self._process_document, raises_on_error=raises_on_error),
input_batch,
):
if item is not None:
@ -205,20 +213,20 @@ class DocumentConverter:
)
return self.initialized_pipelines[pipeline_class]
def process_document(
def _process_document(
self, in_doc: InputDocument, raises_on_error: bool
) -> Optional[ConversionResult]:
assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats
if in_doc.format not in self.allowed_formats:
return None
else:
start_doc_time = time.time()
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
end_doc_time = time.time() - start_doc_time
_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
_log.info(
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
)
return conv_res
@ -228,12 +236,21 @@ class DocumentConverter:
if in_doc.valid:
pipeline = self._get_pipeline(in_doc)
if pipeline is None: # Can't find a default pipeline. Should this raise?
if raises_on_error:
raise RuntimeError(
f"No pipeline could be initialized for {in_doc.file}."
)
else:
conv_res = ConversionResult(input=in_doc)
conv_res.status = ConversionStatus.FAILURE
return conv_res
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
else:
if raises_on_error:
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
else:
# invalid doc or not of desired format
conv_res = ConversionResult(input=in_doc)

View File

@ -34,12 +34,6 @@ class BasePipeline(ABC):
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
if not in_doc.valid:
conv_res.status = ConversionStatus.FAILURE
return conv_res
# TODO: propagate option for raises_on_error?
try:
# These steps are building and assembling the structure of the
# output DoclingDocument
@ -155,7 +149,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
pass
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
except Exception as e:
conv_res.status = ConversionStatus.FAILURE