From ba9eaf1bd719b2769679580006d988a6b622793a Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 15 Oct 2024 15:58:39 +0200
Subject: [PATCH] CLI and error handling fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/backend/html_backend.py         |  2 +-
 docling/backend/mspowerpoint_backend.py |  4 +-
 docling/backend/msword_backend.py       |  4 +-
 docling/cli/main.py                     | 30 ++++++++++-----
 docling/datamodel/document.py           |  6 +--
 docling/document_converter.py           | 51 ++++++++++++++++---------
 docling/pipeline/base_pipeline.py       |  8 +---
 7 files changed, 64 insertions(+), 41 deletions(-)

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7878d64f..c7e68681 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -393,7 +393,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if contains_lists is None:
             return cell.text
         else:
-            _log.warn(
+            _log.debug(
                 "should extract the content correctly for table-cells with lists ..."
             )
             return cell.text
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index d50287f5..876a10e1 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -157,9 +157,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                 new_list = None
 
             if is_a_list:
-                _log.info("LIST DETECTED!")
+                _log.debug("LIST DETECTED!")
             else:
-                _log.info("No List")
+                _log.debug("No List")
 
             # for e in p.iter():
             for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index cc0e2613..54136fdd 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -138,7 +138,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 try:
                     self.handle_tables(element, docx_obj, doc)
                 except Exception:
-                    _log.error("could not parse a table, broken docx table")
+                    _log.debug("could not parse a table, broken docx table")
 
             elif found_drawing or found_pict:
                 self.handle_pictures(element, docx_obj, doc)
@@ -146,7 +146,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             elif tag_name in ["p"]:
                 self.handle_text_elements(element, docx_obj, doc)
             else:
-                _log.warn(f"Ignoring element in DOCX with tag: {tag_name}")
+                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
         return doc
 
     def str_to_int(self, s, default=0):
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 6610cef2..f97e4938 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -5,7 +5,7 @@ import time
 import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional
 
 import typer
 from docling_core.utils.file import resolve_file_source
@@ -26,7 +26,7 @@ from docling.datamodel.pipeline_options import (
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -152,6 +152,14 @@ def convert(
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    abort_on_error: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--abort-on-error/--no-abort-on-error",
+            help="If enabled, abort the conversion as soon as an error is encountered.",
+        ),
+    ] = False,
     output: Annotated[
         Path, typer.Option(..., help="Output directory where results are saved.")
     ] = Path("."),
@@ -211,18 +219,22 @@ def convert(
     )
     pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
 
+    format_options: Dict[InputFormat, FormatOption] = {
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=DoclingParseDocumentBackend,  # pdf_backend
+        )
+    }
     doc_converter = DocumentConverter(
-        format_options={
-            InputFormat.PDF: PdfFormatOption(
-                pipeline_options=pipeline_options,
-                backend=DoclingParseDocumentBackend,  # pdf_backend
-            )
-        }
+        allowed_formats=from_formats,
+        format_options=format_options,
     )
 
     start_time = time.time()
 
-    conv_results = doc_converter.convert_all(input_doc_paths)
+    conv_results = doc_converter.convert_all(
+        input_doc_paths, raises_on_error=abort_on_error
+    )
 
     output.mkdir(parents=True, exist_ok=True)
     export_documents(
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index bcc0254e..41d62114 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -160,7 +160,7 @@ class InputDocument(BaseModel):
     ) -> None:
         if backend is None:
             raise RuntimeError(
-                f"No backend configuration provided for file {self.file} with format {self.format}. "
+                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
                 f"Please check your format configuration on DocumentConverter."
             )
 
@@ -472,8 +472,8 @@ class _DocumentConversionInput(BaseModel):
             obj = resolve_file_source(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
             if format not in format_options.keys():
-                _log.debug(
-                    f"Skipping input document {obj.name} because its format is not in the whitelist."
+                _log.info(
+                    f"Skipping input document {obj.name} because it does not match any of the allowed formats."
                 )
                 continue
             else:
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 017a2096..a44dc9ce 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -111,6 +111,14 @@ class DocumentConverter:
                     _log.debug(f"Requested format {f} will use default options.")
                     self.format_to_options[f] = _format_to_default_options[f]
 
+            remove_keys = []
+            for f in self.format_to_options.keys():
+                if f not in self.allowed_formats:
+                    remove_keys.append(f)
+
+            for f in remove_keys:
+                self.format_to_options.pop(f)
+
         self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
 
     @validate_call(config=ConfigDict(strict=True))
@@ -176,7 +184,7 @@ class DocumentConverter:
 
             # Note: PDF backends are not thread-safe, thread pool usage was disabled.
             for item in map(
-                partial(self.process_document, raises_on_error=raises_on_error),
+                partial(self._process_document, raises_on_error=raises_on_error),
                 input_batch,
             ):
                 if item is not None:
@@ -205,22 +213,22 @@ class DocumentConverter:
             )
         return self.initialized_pipelines[pipeline_class]
 
-    def process_document(
+    def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
     ) -> Optional[ConversionResult]:
         assert self.allowed_formats is not None
+        assert in_doc.format in self.allowed_formats
 
-        if in_doc.format not in self.allowed_formats:
-            return None
-        else:
-            start_doc_time = time.time()
+        start_doc_time = time.time()
 
-            conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
 
-            end_doc_time = time.time() - start_doc_time
-            _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
+        end_doc_time = time.time() - start_doc_time
+        _log.info(
+            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
+        )
 
-            return conv_res
+        return conv_res
 
     def _execute_pipeline(
         self, in_doc: InputDocument, raises_on_error: bool
@@ -228,16 +236,25 @@ class DocumentConverter:
         if in_doc.valid:
             pipeline = self._get_pipeline(in_doc)
             if pipeline is None:  # Can't find a default pipeline. Should this raise?
-                conv_res = ConversionResult(input=in_doc)
-                conv_res.status = ConversionStatus.FAILURE
-                return conv_res
+                if raises_on_error:
+                    raise RuntimeError(
+                        f"No pipeline could be initialized for {in_doc.file}."
+                    )
+                else:
+                    conv_res = ConversionResult(input=in_doc)
+                    conv_res.status = ConversionStatus.FAILURE
+                    return conv_res
 
             conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
 
         else:
-            # invalid doc or not of desired format
-            conv_res = ConversionResult(input=in_doc)
-            conv_res.status = ConversionStatus.FAILURE
-            # TODO add error log why it failed.
+            if raises_on_error:
+                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+
+            else:
+                # invalid doc or not of desired format
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                # TODO add error log why it failed.
 
         return conv_res
diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py
index 5e26fe0c..8dd074cc 100644
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -34,12 +34,6 @@ class BasePipeline(ABC):
         conv_res = ConversionResult(input=in_doc)
 
         _log.info(f"Processing document {in_doc.file.name}")
-
-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
-
-        # TODO: propagate option for raises_on_error?
         try:
             # These steps are building and assembling the structure of the
             # output DoclingDocument
@@ -155,7 +149,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                     pass
 
                 end_pb_time = time.time() - start_pb_time
-                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+                _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
 
         except Exception as e:
             conv_res.status = ConversionStatus.FAILURE