diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7878d64f..c7e68681 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -393,7 +393,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if contains_lists is None:
return cell.text
else:
- _log.warn(
+ _log.debug(
"should extract the content correctly for table-cells with lists ..."
)
return cell.text
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index d50287f5..876a10e1 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -157,9 +157,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
new_list = None
if is_a_list:
- _log.info("LIST DETECTED!")
+ _log.debug("LIST DETECTED!")
else:
- _log.info("No List")
+ _log.debug("No List")
# for e in p.iter():
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index cc0e2613..54136fdd 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -138,7 +138,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
try:
self.handle_tables(element, docx_obj, doc)
except Exception:
- _log.error("could not parse a table, broken docx table")
+ _log.debug("could not parse a table, broken docx table")
elif found_drawing or found_pict:
self.handle_pictures(element, docx_obj, doc)
@@ -146,7 +146,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
elif tag_name in ["p"]:
self.handle_text_elements(element, docx_obj, doc)
else:
- _log.warn(f"Ignoring element in DOCX with tag: {tag_name}")
+ _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def str_to_int(self, s, default=0):
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 6610cef2..f97e4938 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -5,7 +5,7 @@ import time
import warnings
from enum import Enum
from pathlib import Path
-from typing import Annotated, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional
import typer
from docling_core.utils.file import resolve_file_source
@@ -26,7 +26,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -152,6 +152,14 @@ def convert(
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
+ abort_on_error: Annotated[
+ bool,
+ typer.Option(
+ ...,
+ "--abort-on-error/--no-abort-on-error",
            help="If enabled, the processing will be aborted when the first error is encountered.",
+ ),
+ ] = False,
output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."),
@@ -211,18 +219,22 @@ def convert(
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
+ format_options: Dict[InputFormat, FormatOption] = {
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=pipeline_options,
+ backend=DoclingParseDocumentBackend, # pdf_backend
+ )
+ }
doc_converter = DocumentConverter(
- format_options={
- InputFormat.PDF: PdfFormatOption(
- pipeline_options=pipeline_options,
- backend=DoclingParseDocumentBackend, # pdf_backend
- )
- }
+ allowed_formats=from_formats,
+ format_options=format_options,
)
start_time = time.time()
- conv_results = doc_converter.convert_all(input_doc_paths)
+ conv_results = doc_converter.convert_all(
+ input_doc_paths, raises_on_error=abort_on_error
+ )
output.mkdir(parents=True, exist_ok=True)
export_documents(
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index bcc0254e..41d62114 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -160,7 +160,7 @@ class InputDocument(BaseModel):
) -> None:
if backend is None:
raise RuntimeError(
- f"No backend configuration provided for file {self.file} with format {self.format}. "
+ f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)
@@ -472,8 +472,8 @@ class _DocumentConversionInput(BaseModel):
obj = resolve_file_source(item) if isinstance(item, str) else item
format = self._guess_format(obj)
if format not in format_options.keys():
- _log.debug(
- f"Skipping input document {obj.name} because its format is not in the whitelist."
+ _log.info(
+            f"Skipping input document {obj.name} because it does not match any of the allowed formats."
)
continue
else:
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 017a2096..a44dc9ce 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -111,6 +111,14 @@ class DocumentConverter:
_log.debug(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f]
+ remove_keys = []
+ for f in self.format_to_options.keys():
+ if f not in self.allowed_formats:
+ remove_keys.append(f)
+
+ for f in remove_keys:
+ self.format_to_options.pop(f)
+
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
@validate_call(config=ConfigDict(strict=True))
@@ -176,7 +184,7 @@ class DocumentConverter:
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
for item in map(
- partial(self.process_document, raises_on_error=raises_on_error),
+ partial(self._process_document, raises_on_error=raises_on_error),
input_batch,
):
if item is not None:
@@ -205,22 +213,22 @@ class DocumentConverter:
)
return self.initialized_pipelines[pipeline_class]
- def process_document(
+ def _process_document(
self, in_doc: InputDocument, raises_on_error: bool
) -> Optional[ConversionResult]:
assert self.allowed_formats is not None
+ assert in_doc.format in self.allowed_formats
- if in_doc.format not in self.allowed_formats:
- return None
- else:
- start_doc_time = time.time()
+ start_doc_time = time.time()
- conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+ conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
- end_doc_time = time.time() - start_doc_time
- _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
+ end_doc_time = time.time() - start_doc_time
+ _log.info(
+ f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
+ )
- return conv_res
+ return conv_res
def _execute_pipeline(
self, in_doc: InputDocument, raises_on_error: bool
@@ -228,16 +236,25 @@ class DocumentConverter:
if in_doc.valid:
pipeline = self._get_pipeline(in_doc)
if pipeline is None: # Can't find a default pipeline. Should this raise?
- conv_res = ConversionResult(input=in_doc)
- conv_res.status = ConversionStatus.FAILURE
- return conv_res
+ if raises_on_error:
+ raise RuntimeError(
+ f"No pipeline could be initialized for {in_doc.file}."
+ )
+ else:
+ conv_res = ConversionResult(input=in_doc)
+ conv_res.status = ConversionStatus.FAILURE
+ return conv_res
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
else:
- # invalid doc or not of desired format
- conv_res = ConversionResult(input=in_doc)
- conv_res.status = ConversionStatus.FAILURE
- # TODO add error log why it failed.
+ if raises_on_error:
+ raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+
+ else:
+ # invalid doc or not of desired format
+ conv_res = ConversionResult(input=in_doc)
+ conv_res.status = ConversionStatus.FAILURE
+ # TODO add error log why it failed.
return conv_res
diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py
index 5e26fe0c..8dd074cc 100644
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -34,12 +34,6 @@ class BasePipeline(ABC):
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
-
- if not in_doc.valid:
- conv_res.status = ConversionStatus.FAILURE
- return conv_res
-
- # TODO: propagate option for raises_on_error?
try:
# These steps are building and assembling the structure of the
# output DoclingDocument
@@ -155,7 +149,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
pass
end_pb_time = time.time() - start_pb_time
- _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+ _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
except Exception as e:
conv_res.status = ConversionStatus.FAILURE