mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
CLI and error handling fixes
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
a66c4ee8eb
commit
ba9eaf1bd7
@ -393,7 +393,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if contains_lists is None:
|
||||
return cell.text
|
||||
else:
|
||||
_log.warn(
|
||||
_log.debug(
|
||||
"should extract the content correctly for table-cells with lists ..."
|
||||
)
|
||||
return cell.text
|
||||
|
@ -157,9 +157,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
new_list = None
|
||||
|
||||
if is_a_list:
|
||||
_log.info("LIST DETECTED!")
|
||||
_log.debug("LIST DETECTED!")
|
||||
else:
|
||||
_log.info("No List")
|
||||
_log.debug("No List")
|
||||
|
||||
# for e in p.iter():
|
||||
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
|
||||
|
@ -138,7 +138,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
try:
|
||||
self.handle_tables(element, docx_obj, doc)
|
||||
except Exception:
|
||||
_log.error("could not parse a table, broken docx table")
|
||||
_log.debug("could not parse a table, broken docx table")
|
||||
|
||||
elif found_drawing or found_pict:
|
||||
self.handle_pictures(element, docx_obj, doc)
|
||||
@ -146,7 +146,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif tag_name in ["p"]:
|
||||
self.handle_text_elements(element, docx_obj, doc)
|
||||
else:
|
||||
_log.warn(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
return doc
|
||||
|
||||
def str_to_int(self, s, default=0):
|
||||
|
@ -5,7 +5,7 @@ import time
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Iterable, List, Optional
|
||||
from typing import Annotated, Dict, Iterable, List, Optional
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
@ -26,7 +26,7 @@ from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
@ -152,6 +152,14 @@ def convert(
|
||||
ocr_engine: Annotated[
|
||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||
] = OcrEngine.EASYOCR,
|
||||
abort_on_error: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
"--abort-on-error/--no-abort-on-error",
|
||||
help="If enabled, the bitmap content will be processed using OCR.",
|
||||
),
|
||||
] = False,
|
||||
output: Annotated[
|
||||
Path, typer.Option(..., help="Output directory where results are saved.")
|
||||
] = Path("."),
|
||||
@ -211,18 +219,22 @@ def convert(
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=DoclingParseDocumentBackend, # pdf_backend
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=DoclingParseDocumentBackend, # pdf_backend
|
||||
)
|
||||
}
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_all(input_doc_paths)
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, raises_on_error=abort_on_error
|
||||
)
|
||||
|
||||
output.mkdir(parents=True, exist_ok=True)
|
||||
export_documents(
|
||||
|
@ -160,7 +160,7 @@ class InputDocument(BaseModel):
|
||||
) -> None:
|
||||
if backend is None:
|
||||
raise RuntimeError(
|
||||
f"No backend configuration provided for file {self.file} with format {self.format}. "
|
||||
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
@ -472,8 +472,8 @@ class _DocumentConversionInput(BaseModel):
|
||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||
format = self._guess_format(obj)
|
||||
if format not in format_options.keys():
|
||||
_log.debug(
|
||||
f"Skipping input document {obj.name} because its format is not in the whitelist."
|
||||
_log.info(
|
||||
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
||||
)
|
||||
continue
|
||||
else:
|
||||
|
@ -111,6 +111,14 @@ class DocumentConverter:
|
||||
_log.debug(f"Requested format {f} will use default options.")
|
||||
self.format_to_options[f] = _format_to_default_options[f]
|
||||
|
||||
remove_keys = []
|
||||
for f in self.format_to_options.keys():
|
||||
if f not in self.allowed_formats:
|
||||
remove_keys.append(f)
|
||||
|
||||
for f in remove_keys:
|
||||
self.format_to_options.pop(f)
|
||||
|
||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
||||
|
||||
@validate_call(config=ConfigDict(strict=True))
|
||||
@ -176,7 +184,7 @@ class DocumentConverter:
|
||||
|
||||
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
||||
for item in map(
|
||||
partial(self.process_document, raises_on_error=raises_on_error),
|
||||
partial(self._process_document, raises_on_error=raises_on_error),
|
||||
input_batch,
|
||||
):
|
||||
if item is not None:
|
||||
@ -205,22 +213,22 @@ class DocumentConverter:
|
||||
)
|
||||
return self.initialized_pipelines[pipeline_class]
|
||||
|
||||
def process_document(
|
||||
def _process_document(
|
||||
self, in_doc: InputDocument, raises_on_error: bool
|
||||
) -> Optional[ConversionResult]:
|
||||
assert self.allowed_formats is not None
|
||||
assert in_doc.format in self.allowed_formats
|
||||
|
||||
if in_doc.format not in self.allowed_formats:
|
||||
return None
|
||||
else:
|
||||
start_doc_time = time.time()
|
||||
start_doc_time = time.time()
|
||||
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
|
||||
end_doc_time = time.time() - start_doc_time
|
||||
_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
|
||||
end_doc_time = time.time() - start_doc_time
|
||||
_log.info(
|
||||
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
|
||||
)
|
||||
|
||||
return conv_res
|
||||
return conv_res
|
||||
|
||||
def _execute_pipeline(
|
||||
self, in_doc: InputDocument, raises_on_error: bool
|
||||
@ -228,16 +236,25 @@ class DocumentConverter:
|
||||
if in_doc.valid:
|
||||
pipeline = self._get_pipeline(in_doc)
|
||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
if raises_on_error:
|
||||
raise RuntimeError(
|
||||
f"No pipeline could be initialized for {in_doc.file}."
|
||||
)
|
||||
else:
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||
|
||||
else:
|
||||
# invalid doc or not of desired format
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
# TODO add error log why it failed.
|
||||
if raises_on_error:
|
||||
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
||||
|
||||
else:
|
||||
# invalid doc or not of desired format
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
# TODO add error log why it failed.
|
||||
|
||||
return conv_res
|
||||
|
@ -34,12 +34,6 @@ class BasePipeline(ABC):
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
|
||||
if not in_doc.valid:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
# TODO: propagate option for raises_on_error?
|
||||
try:
|
||||
# These steps are building and assembling the structure of the
|
||||
# output DoclingDocument
|
||||
@ -155,7 +149,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
pass
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
|
Loading…
Reference in New Issue
Block a user