CLI and error handling fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-11 14:18:30 +00:00 · 2024-10-15 15:58:39 +02:00
parent a66c4ee8eb
commit ba9eaf1bd7
7 changed files with 64 additions and 41 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -393,7 +393,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        if contains_lists is None:
            return cell.text
        else:
-            _log.warn(
+            _log.debug(
                "should extract the content correctly for table-cells with lists ..."
            )
            return cell.text
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -157,9 +157,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                new_list = None

            if is_a_list:
-                _log.info("LIST DETECTED!")
+                _log.debug("LIST DETECTED!")
            else:
-                _log.info("No List")
+                _log.debug("No List")

            # for e in p.iter():
            for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -138,7 +138,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                try:
                    self.handle_tables(element, docx_obj, doc)
                except Exception:
-                    _log.error("could not parse a table, broken docx table")
+                    _log.debug("could not parse a table, broken docx table")

            elif found_drawing or found_pict:
                self.handle_pictures(element, docx_obj, doc)
@@ -146,7 +146,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            elif tag_name in ["p"]:
                self.handle_text_elements(element, docx_obj, doc)
            else:
-                _log.warn(f"Ignoring element in DOCX with tag: {tag_name}")
+                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
        return doc

    def str_to_int(self, s, default=0):
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -5,7 +5,7 @@ import time
 import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional

 import typer
 from docling_core.utils.file import resolve_file_source
@@ -26,7 +26,7 @@ from docling.datamodel.pipeline_options import (
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -152,6 +152,14 @@ def convert(
    ocr_engine: Annotated[
        OcrEngine, typer.Option(..., help="The OCR engine to use.")
    ] = OcrEngine.EASYOCR,
+    abort_on_error: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--abort-on-error/--no-abort-on-error",
+            help="If enabled, the conversion will abort when an error is encountered.",
+        ),
+    ] = False,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
@@ -211,18 +219,22 @@ def convert(
    )
    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching

-    doc_converter = DocumentConverter(
-        format_options={
+    format_options: Dict[InputFormat, FormatOption] = {
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=DoclingParseDocumentBackend,  # pdf_backend
        )
    }
+    doc_converter = DocumentConverter(
+        allowed_formats=from_formats,
+        format_options=format_options,
    )

    start_time = time.time()

-    conv_results = doc_converter.convert_all(input_doc_paths)
+    conv_results = doc_converter.convert_all(
+        input_doc_paths, raises_on_error=abort_on_error
+    )

    output.mkdir(parents=True, exist_ok=True)
    export_documents(
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -160,7 +160,7 @@ class InputDocument(BaseModel):
    ) -> None:
        if backend is None:
            raise RuntimeError(
-                f"No backend configuration provided for file {self.file} with format {self.format}. "
+                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
                f"Please check your format configuration on DocumentConverter."
            )

@@ -472,8 +472,8 @@ class _DocumentConversionInput(BaseModel):
            obj = resolve_file_source(item) if isinstance(item, str) else item
            format = self._guess_format(obj)
            if format not in format_options.keys():
-                _log.debug(
-                    f"Skipping input document {obj.name} because its format is not in the whitelist."
+                _log.info(
+                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
                )
                continue
            else:
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -111,6 +111,14 @@ class DocumentConverter:
                    _log.debug(f"Requested format {f} will use default options.")
                    self.format_to_options[f] = _format_to_default_options[f]

+            remove_keys = []
+            for f in self.format_to_options.keys():
+                if f not in self.allowed_formats:
+                    remove_keys.append(f)
+
+            for f in remove_keys:
+                self.format_to_options.pop(f)
+
        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}

    @validate_call(config=ConfigDict(strict=True))
@@ -176,7 +184,7 @@ class DocumentConverter:

            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
            for item in map(
-                partial(self.process_document, raises_on_error=raises_on_error),
+                partial(self._process_document, raises_on_error=raises_on_error),
                input_batch,
            ):
                if item is not None:
@@ -205,20 +213,20 @@ class DocumentConverter:
            )
        return self.initialized_pipelines[pipeline_class]

-    def process_document(
+    def _process_document(
        self, in_doc: InputDocument, raises_on_error: bool
    ) -> Optional[ConversionResult]:
        assert self.allowed_formats is not None
+        assert in_doc.format in self.allowed_formats

-        if in_doc.format not in self.allowed_formats:
-            return None
-        else:
        start_doc_time = time.time()

        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

        end_doc_time = time.time() - start_doc_time
-            _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
+        _log.info(
+            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
+        )

        return conv_res

@@ -228,12 +236,21 @@ class DocumentConverter:
        if in_doc.valid:
            pipeline = self._get_pipeline(in_doc)
            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                if raises_on_error:
+                    raise RuntimeError(
+                        f"No pipeline could be initialized for {in_doc.file}."
+                    )
+                else:
                    conv_res = ConversionResult(input=in_doc)
                    conv_res.status = ConversionStatus.FAILURE
                    return conv_res

            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)

+        else:
+            if raises_on_error:
+                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+
            else:
                # invalid doc or not of desired format
                conv_res = ConversionResult(input=in_doc)
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -34,12 +34,6 @@ class BasePipeline(ABC):
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")
-
-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
-
-        # TODO: propagate option for raises_on_error?
        try:
            # These steps are building and assembling the structure of the
            # output DoclingDocument
@@ -155,7 +149,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                    pass

                end_pb_time = time.time() - start_pb_time
-                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+                _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")

        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE