From ba9eaf1bd719b2769679580006d988a6b622793a Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 15 Oct 2024 15:58:39 +0200 Subject: [PATCH] CLI and error handling fixes Signed-off-by: Christoph Auer --- docling/backend/html_backend.py | 2 +- docling/backend/mspowerpoint_backend.py | 4 +- docling/backend/msword_backend.py | 4 +- docling/cli/main.py | 30 ++++++++++----- docling/datamodel/document.py | 6 +-- docling/document_converter.py | 51 ++++++++++++++++--------- docling/pipeline/base_pipeline.py | 8 +--- 7 files changed, 64 insertions(+), 41 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 7878d64f..c7e68681 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -393,7 +393,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if contains_lists is None: return cell.text else: - _log.warn( + _log.debug( "should extract the content correctly for table-cells with lists ..." ) return cell.text diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index d50287f5..876a10e1 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -157,9 +157,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB new_list = None if is_a_list: - _log.info("LIST DETECTED!") + _log.debug("LIST DETECTED!") else: - _log.info("No List") + _log.debug("No List") # for e in p.iter(): for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}): diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index cc0e2613..54136fdd 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -138,7 +138,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): try: self.handle_tables(element, docx_obj, doc) except Exception: - _log.error("could not parse a table, broken docx table") + _log.debug("could not parse a table, broken 
docx table") elif found_drawing or found_pict: self.handle_pictures(element, docx_obj, doc) @@ -146,7 +146,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elif tag_name in ["p"]: self.handle_text_elements(element, docx_obj, doc) else: - _log.warn(f"Ignoring element in DOCX with tag: {tag_name}") + _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") return doc def str_to_int(self, s, default=0): diff --git a/docling/cli/main.py b/docling/cli/main.py index 6610cef2..f97e4938 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -5,7 +5,7 @@ import time import warnings from enum import Enum from pathlib import Path -from typing import Annotated, Iterable, List, Optional +from typing import Annotated, Dict, Iterable, List, Optional import typer from docling_core.utils.file import resolve_file_source @@ -26,7 +26,7 @@ from docling.datamodel.pipeline_options import ( TesseractCliOcrOptions, TesseractOcrOptions, ) -from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") @@ -152,6 +152,14 @@ def convert( ocr_engine: Annotated[ OcrEngine, typer.Option(..., help="The OCR engine to use.") ] = OcrEngine.EASYOCR, + abort_on_error: Annotated[ + bool, + typer.Option( + ..., + "--abort-on-error/--no-abort-on-error", + help="If enabled, the conversion will abort when any of the processed documents fails.", + ), + ] = False, output: Annotated[ Path, typer.Option(..., help="Output directory where results are saved.") ] = Path("."), @@ -211,18 +219,22 @@ def convert( ) pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching + format_options: Dict[InputFormat, FormatOption] = { + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + 
backend=DoclingParseDocumentBackend, # pdf_backend + ) + } doc_converter = DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=pipeline_options, - backend=DoclingParseDocumentBackend, # pdf_backend - ) - } + allowed_formats=from_formats, + format_options=format_options, ) start_time = time.time() - conv_results = doc_converter.convert_all(input_doc_paths) + conv_results = doc_converter.convert_all( + input_doc_paths, raises_on_error=abort_on_error + ) output.mkdir(parents=True, exist_ok=True) export_documents( diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index bcc0254e..41d62114 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -160,7 +160,7 @@ class InputDocument(BaseModel): ) -> None: if backend is None: raise RuntimeError( - f"No backend configuration provided for file {self.file} with format {self.format}. " + f"No backend configuration provided for file {self.file.name} with format {self.format}. " f"Please check your format configuration on DocumentConverter." ) @@ -472,8 +472,8 @@ class _DocumentConversionInput(BaseModel): obj = resolve_file_source(item) if isinstance(item, str) else item format = self._guess_format(obj) if format not in format_options.keys(): - _log.debug( - f"Skipping input document {obj.name} because its format is not in the whitelist." + _log.info( + f"Skipping input document {obj.name} because it isn't matching any of the allowed formats." 
) continue else: diff --git a/docling/document_converter.py b/docling/document_converter.py index 017a2096..a44dc9ce 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -111,6 +111,14 @@ class DocumentConverter: _log.debug(f"Requested format {f} will use default options.") self.format_to_options[f] = _format_to_default_options[f] + remove_keys = [] + for f in self.format_to_options.keys(): + if f not in self.allowed_formats: + remove_keys.append(f) + + for f in remove_keys: + self.format_to_options.pop(f) + self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {} @validate_call(config=ConfigDict(strict=True)) @@ -176,7 +184,7 @@ class DocumentConverter: # Note: PDF backends are not thread-safe, thread pool usage was disabled. for item in map( - partial(self.process_document, raises_on_error=raises_on_error), + partial(self._process_document, raises_on_error=raises_on_error), input_batch, ): if item is not None: @@ -205,22 +213,22 @@ class DocumentConverter: ) return self.initialized_pipelines[pipeline_class] - def process_document( + def _process_document( self, in_doc: InputDocument, raises_on_error: bool ) -> Optional[ConversionResult]: assert self.allowed_formats is not None + assert in_doc.format in self.allowed_formats - if in_doc.format not in self.allowed_formats: - return None - else: - start_doc_time = time.time() + start_doc_time = time.time() - conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) + conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) - end_doc_time = time.time() - start_doc_time - _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.") + end_doc_time = time.time() - start_doc_time + _log.info( + f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds." 
+ ) - return conv_res + return conv_res def _execute_pipeline( self, in_doc: InputDocument, raises_on_error: bool @@ -228,16 +236,25 @@ class DocumentConverter: if in_doc.valid: pipeline = self._get_pipeline(in_doc) if pipeline is None: # Can't find a default pipeline. Should this raise? - conv_res = ConversionResult(input=in_doc) - conv_res.status = ConversionStatus.FAILURE - return conv_res + if raises_on_error: + raise RuntimeError( + f"No pipeline could be initialized for {in_doc.file}." + ) + else: + conv_res = ConversionResult(input=in_doc) + conv_res.status = ConversionStatus.FAILURE + return conv_res conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error) else: - # invalid doc or not of desired format - conv_res = ConversionResult(input=in_doc) - conv_res.status = ConversionStatus.FAILURE - # TODO add error log why it failed. + if raises_on_error: + raise RuntimeError(f"Input document {in_doc.file} is not valid.") + + else: + # invalid doc or not of desired format + conv_res = ConversionResult(input=in_doc) + conv_res.status = ConversionStatus.FAILURE + # TODO add error log why it failed. return conv_res diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 5e26fe0c..8dd074cc 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -34,12 +34,6 @@ class BasePipeline(ABC): conv_res = ConversionResult(input=in_doc) _log.info(f"Processing document {in_doc.file.name}") - - if not in_doc.valid: - conv_res.status = ConversionStatus.FAILURE - return conv_res - - # TODO: propagate option for raises_on_error? try: # These steps are building and assembling the structure of the # output DoclingDocument @@ -155,7 +149,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. 
pass end_pb_time = time.time() - start_pb_time - _log.info(f"Finished converting page batch time={end_pb_time:.3f}") + _log.debug(f"Finished converting page batch time={end_pb_time:.3f}") except Exception as e: conv_res.status = ConversionStatus.FAILURE