Merge branch 'release_v3' of github.com:DS4SD/docling into cau/layout-postprocessing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-13 23:28:21 +00:00 · 2024-12-04 14:21:09 +01:00
parent 11c7c43bad 78fad801fe
commit e97688cd3d
27 changed files with 1581 additions and 835 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -2,6 +2,7 @@ import importlib
 import json
 import logging
 import re
+import tempfile
 import time
 import warnings
 from enum import Enum
@@ -9,7 +10,7 @@ from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type

 import typer
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_path

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -32,6 +33,7 @@ from docling.datamodel.pipeline_options import (
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
+from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -212,6 +214,24 @@ def convert(
            help="Set the verbosity level. -v for info logging, -vv for debug logging.",
        ),
    ] = 0,
+    debug_visualize_cells: Annotated[
+        bool,
+        typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
+    ] = False,
+    debug_visualize_ocr: Annotated[
+        bool,
+        typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
+    ] = False,
+    debug_visualize_layout: Annotated[
+        bool,
+        typer.Option(
+            ..., help="Enable debug output which visualizes the layour clusters"
+        ),
+    ] = False,
+    debug_visualize_tables: Annotated[
+        bool,
+        typer.Option(..., help="Enable debug output which visualizes the table cells"),
+    ] = False,
    version: Annotated[
        Optional[bool],
        typer.Option(
@@ -229,98 +249,106 @@ def convert(
    elif verbose == 2:
        logging.basicConfig(level=logging.DEBUG)

+    settings.debug.visualize_cells = debug_visualize_cells
+    settings.debug.visualize_layout = debug_visualize_layout
+    settings.debug.visualize_tables = debug_visualize_tables
+    settings.debug.visualize_ocr = debug_visualize_ocr
+
    if from_formats is None:
        from_formats = [e for e in InputFormat]

-    input_doc_paths: List[Path] = []
-    for src in input_sources:
-        source = resolve_file_source(source=src)
-        if not source.exists():
-            err_console.print(
-                f"[red]Error: The input file {source} does not exist.[/red]"
-            )
-            raise typer.Abort()
-        elif source.is_dir():
-            for fmt in from_formats:
-                for ext in FormatToExtensions[fmt]:
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+    with tempfile.TemporaryDirectory() as tempdir:
+        input_doc_paths: List[Path] = []
+        for src in input_sources:
+            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+            if not source.exists():
+                err_console.print(
+                    f"[red]Error: The input file {source} does not exist.[/red]"
+                )
+                raise typer.Abort()
+            elif source.is_dir():
+                for fmt in from_formats:
+                    for ext in FormatToExtensions[fmt]:
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+            else:
+                input_doc_paths.append(source)
+
+        if to_formats is None:
+            to_formats = [OutputFormat.MARKDOWN]
+
+        export_json = OutputFormat.JSON in to_formats
+        export_md = OutputFormat.MARKDOWN in to_formats
+        export_txt = OutputFormat.TEXT in to_formats
+        export_doctags = OutputFormat.DOCTAGS in to_formats
+
+        if ocr_engine == OcrEngine.EASYOCR:
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.RAPIDOCR:
+            ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
        else:
-            input_doc_paths.append(source)
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

-    if to_formats is None:
-        to_formats = [OutputFormat.MARKDOWN]
+        ocr_lang_list = _split_list(ocr_lang)
+        if ocr_lang_list is not None:
+            ocr_options.lang = ocr_lang_list

-    export_json = OutputFormat.JSON in to_formats
-    export_md = OutputFormat.MARKDOWN in to_formats
-    export_txt = OutputFormat.TEXT in to_formats
-    export_doctags = OutputFormat.DOCTAGS in to_formats
-
-    if ocr_engine == OcrEngine.EASYOCR:
-        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT_CLI:
-        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT:
-        ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.OCRMAC:
-        ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.RAPIDOCR:
-        ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
-    else:
-        raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
-
-    ocr_lang_list = _split_list(ocr_lang)
-    if ocr_lang_list is not None:
-        ocr_options.lang = ocr_lang_list
-
-    pipeline_options = PdfPipelineOptions(
-        do_ocr=ocr,
-        ocr_options=ocr_options,
-        do_table_structure=True,
-    )
-    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
-    pipeline_options.table_structure_options.mode = table_mode
-
-    if artifacts_path is not None:
-        pipeline_options.artifacts_path = artifacts_path
-
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,  # pdf_backend
+        pipeline_options = PdfPipelineOptions(
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
        )
-    }
-    doc_converter = DocumentConverter(
-        allowed_formats=from_formats,
-        format_options=format_options,
-    )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode

-    start_time = time.time()
+        if artifacts_path is not None:
+            pipeline_options.artifacts_path = artifacts_path

-    conv_results = doc_converter.convert_all(
-        input_doc_paths, raises_on_error=abort_on_error
-    )
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

-    output.mkdir(parents=True, exist_ok=True)
-    export_documents(
-        conv_results,
-        output_dir=output,
-        export_json=export_json,
-        export_md=export_md,
-        export_txt=export_txt,
-        export_doctags=export_doctags,
-    )
+        format_options: Dict[InputFormat, FormatOption] = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=backend,  # pdf_backend
+            )
+        }
+        doc_converter = DocumentConverter(
+            allowed_formats=from_formats,
+            format_options=format_options,
+        )

-    end_time = time.time() - start_time
+        start_time = time.time()
+
+        conv_results = doc_converter.convert_all(
+            input_doc_paths, raises_on_error=abort_on_error
+        )
+
+        output.mkdir(parents=True, exist_ok=True)
+        export_documents(
+            conv_results,
+            output_dir=output,
+            export_json=export_json,
+            export_md=export_md,
+            export_txt=export_txt,
+            export_doctags=export_doctags,
+        )
+
+        end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,5 +1,4 @@
 from enum import Enum, auto
-from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Union

 from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
    Size,
    TableCell,
 )
+from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from this location
+    DocumentStream,
+)
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict

@@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
    FAILURE = auto()
    SUCCESS = auto()
    PARTIAL_SUCCESS = auto()
+    SKIPPED = auto()


 class InputFormat(str, Enum):
@@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
    DOCUMENT_BACKEND = auto()
    MODEL = auto()
    DOC_ASSEMBLER = auto()
+    USER_INPUT = auto()


 class ErrorItem(BaseModel):
@@ -214,10 +218,3 @@ class Page(BaseModel):
    @property
    def image(self) -> Optional[Image]:
        return self.get_image(scale=self._default_image_scale)
-
-
-class DocumentStream(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    name: str
-    stream: BytesIO
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union

 import filetype
 from docling_core.types.doc import (
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_stream
 from pydantic import BaseModel
 from typing_extensions import deprecated

@@ -166,12 +166,6 @@ class InputDocument(BaseModel):
        backend: Type[AbstractDocumentBackend],
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
-        if backend is None:
-            raise RuntimeError(
-                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
-                f"Please check your format configuration on DocumentConverter."
-            )
-
        self._backend = backend(self, path_or_stream=path_or_stream)
        if not self._backend.is_valid():
            self.valid = False
@@ -452,6 +446,25 @@ class ConversionResult(BaseModel):
        return ds_doc


+class _DummyBackend(AbstractDocumentBackend):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def is_valid(self) -> bool:
+        return False
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set()
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    def unload(self):
+        return super().unload()
+
+
 class _DocumentConversionInput(BaseModel):

    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -461,13 +474,14 @@ class _DocumentConversionInput(BaseModel):
        self, format_options: Dict[InputFormat, "FormatOption"]
    ) -> Iterable[InputDocument]:
        for item in self.path_or_stream_iterator:
-            obj = resolve_file_source(item) if isinstance(item, str) else item
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
            format = self._guess_format(obj)
+            backend: Type[AbstractDocumentBackend]
            if format not in format_options.keys():
-                _log.info(
-                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                _log.error(
+                    f"Input document {obj.name} does not match any allowed format."
                )
-                continue
+                backend = _DummyBackend
            else:
                backend = format_options[format].backend

--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field


 class TableFormerMode(str, Enum):
+    """Modes for the TableFormer model."""
+
    FAST = "fast"
    ACCURATE = "accurate"


 class TableStructureOptions(BaseModel):
+    """Options for the table structure."""
+
    do_cell_matching: bool = (
        True
        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):


 class OcrOptions(BaseModel):
+    """OCR options."""
+
    kind: str
    lang: List[str]
    force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):


 class RapidOcrOptions(OcrOptions):
+    """Options for the RapidOCR engine."""
+
    kind: Literal["rapidocr"] = "rapidocr"

    # English and chinese are the most commly used models and have been tested with RapidOCR.
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):


 class EasyOcrOptions(OcrOptions):
+    """Options for the EasyOCR engine."""
+
    kind: Literal["easyocr"] = "easyocr"
    lang: List[str] = ["fr", "de", "es", "en"]
    use_gpu: bool = True  # same default as easyocr.Reader
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):


 class TesseractCliOcrOptions(OcrOptions):
+    """Options for the TesseractCli engine."""
+
    kind: Literal["tesseract"] = "tesseract"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    tesseract_cmd: str = "tesseract"
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):


 class TesseractOcrOptions(OcrOptions):
+    """Options for the Tesseract engine."""
+
    kind: Literal["tesserocr"] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    path: Optional[str] = None
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):


 class OcrMacOptions(OcrOptions):
+    """Options for the Mac OCR engine."""
+
    kind: Literal["ocrmac"] = "ocrmac"
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
    recognition: str = "accurate"
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):


 class PipelineOptions(BaseModel):
+    """Base pipeline options."""
+
    create_legacy_output: bool = (
        True  # This defautl will be set to False on a future version of docling
    )


 class PdfPipelineOptions(PipelineOptions):
+    """Options for the PDF pipeline."""
+
    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DoclingComponentType,
+    DocumentStream,
+    ErrorItem,
+    InputFormat,
+)
 from docling.datamodel.document import (
    ConversionResult,
    InputDocument,
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
 )
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import DocumentLimits, settings
+from docling.exceptions import ConversionError
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend


-_format_to_default_options = {
-    InputFormat.XLSX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
-    ),
-    InputFormat.DOCX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
-    ),
-    InputFormat.PPTX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
-    ),
-    InputFormat.MD: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
-    ),
-    InputFormat.ASCIIDOC: FormatOption(
-        pipeline_cls=SimplePipeline, backend=AsciiDocBackend
-    ),
-    InputFormat.HTML: FormatOption(
-        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
-    ),
-    InputFormat.IMAGE: FormatOption(
-        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
-    ),
-    InputFormat.PDF: FormatOption(
-        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
-    ),
-}
+def _get_default_option(format: InputFormat) -> FormatOption:
+    format_to_default_options = {
+        InputFormat.XLSX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
+        ),
+        InputFormat.DOCX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+        ),
+        InputFormat.PPTX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+        ),
+        InputFormat.MD: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+        ),
+        InputFormat.ASCIIDOC: FormatOption(
+            pipeline_cls=SimplePipeline, backend=AsciiDocBackend
+        ),
+        InputFormat.HTML: FormatOption(
+            pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+        ),
+        InputFormat.IMAGE: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+        ),
+        InputFormat.PDF: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+        ),
+    }
+    if (options := format_to_default_options.get(format)) is not None:
+        return options
+    else:
+        raise RuntimeError(f"No default options configured for {format}")


 class DocumentConverter:
@@ -121,36 +133,26 @@ class DocumentConverter:
        allowed_formats: Optional[List[InputFormat]] = None,
        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
    ):
-        self.allowed_formats = allowed_formats
-        self.format_to_options = format_options
-
-        if self.allowed_formats is None:
-            # if self.format_to_options is not None:
-            #    self.allowed_formats = self.format_to_options.keys()
-            # else:
-            self.allowed_formats = [e for e in InputFormat]  # all formats
-
-        if self.format_to_options is None:
-            self.format_to_options = _format_to_default_options
-        else:
-            for f in self.allowed_formats:
-                if f not in self.format_to_options.keys():
-                    _log.debug(f"Requested format {f} will use default options.")
-                    self.format_to_options[f] = _format_to_default_options[f]
-
-            remove_keys = []
-            for f in self.format_to_options.keys():
-                if f not in self.allowed_formats:
-                    remove_keys.append(f)
-
-            for f in remove_keys:
-                self.format_to_options.pop(f)
-
+        self.allowed_formats = (
+            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+        )
+        self.format_to_options = {
+            format: (
+                _get_default_option(format=format)
+                if (custom_option := (format_options or {}).get(format)) is None
+                else custom_option
+            )
+            for format in self.allowed_formats
+        }
        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}

    def initialize_pipeline(self, format: InputFormat):
        """Initialize the conversion pipeline for the selected format."""
-        self._get_pipeline(doc_format=format)
+        pipeline = self._get_pipeline(doc_format=format)
+        if pipeline is None:
+            raise ConversionError(
+                f"No pipeline could be initialized for format {format}"
+            )

    @validate_call(config=ConfigDict(strict=True))
    def convert(
@@ -186,22 +188,28 @@ class DocumentConverter:
            limits=limits,
        )
        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+
+        had_result = False
        for conv_res in conv_res_iter:
+            had_result = True
            if raises_on_error and conv_res.status not in {
                ConversionStatus.SUCCESS,
                ConversionStatus.PARTIAL_SUCCESS,
            }:
-                raise RuntimeError(
+                raise ConversionError(
                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                )
            else:
                yield conv_res

+        if not had_result and raises_on_error:
+            raise ConversionError(
+                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+            )
+
    def _convert(
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
    ) -> Iterator[ConversionResult]:
-        assert self.format_to_options is not None
-
        start_time = time.monotonic()

        for input_batch in chunkify(
@@ -223,27 +231,22 @@ class DocumentConverter:
            ):
                elapsed = time.monotonic() - start_time
                start_time = time.monotonic()
-
-                if item is not None:
-                    _log.info(
-                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                    )
-                    yield item
-                else:
-                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
+                _log.info(
+                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                )
+                yield item

    def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
-        assert self.format_to_options is not None
-
        fopt = self.format_to_options.get(doc_format)

        if fopt is None:
-            raise RuntimeError(f"Could not get pipeline for {doc_format}")
+            return None
        else:
            pipeline_class = fopt.pipeline_cls
            pipeline_options = fopt.pipeline_options

-        assert pipeline_options is not None
+        if pipeline_options is None:
+            return None
        # TODO this will ignore if different options have been defined for the same pipeline class.
        if (
            pipeline_class not in self.initialized_pipelines
@@ -257,11 +260,26 @@ class DocumentConverter:

    def _process_document(
        self, in_doc: InputDocument, raises_on_error: bool
-    ) -> Optional[ConversionResult]:
-        assert self.allowed_formats is not None
-        assert in_doc.format in self.allowed_formats
+    ) -> ConversionResult:

-        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        valid = (
+            self.allowed_formats is not None and in_doc.format in self.allowed_formats
+        )
+        if valid:
+            conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        else:
+            error_message = f"File format not allowed: {in_doc.file}"
+            if raises_on_error:
+                raise ConversionError(error_message)
+            else:
+                error_item = ErrorItem(
+                    component_type=DoclingComponentType.USER_INPUT,
+                    module_name="",
+                    error_message=error_message,
+                )
+                conv_res = ConversionResult(
+                    input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
+                )

        return conv_res

@@ -270,26 +288,28 @@ class DocumentConverter:
    ) -> ConversionResult:
        if in_doc.valid:
            pipeline = self._get_pipeline(in_doc.format)
-            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+            if pipeline is not None:
+                conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
+            else:
                if raises_on_error:
-                    raise RuntimeError(
+                    raise ConversionError(
                        f"No pipeline could be initialized for {in_doc.file}."
                    )
                else:
-                    conv_res = ConversionResult(input=in_doc)
-                    conv_res.status = ConversionStatus.FAILURE
-                    return conv_res
-
-            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
-
+                    conv_res = ConversionResult(
+                        input=in_doc,
+                        status=ConversionStatus.FAILURE,
+                    )
        else:
            if raises_on_error:
-                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+                raise ConversionError(f"Input document {in_doc.file} is not valid.")

            else:
                # invalid doc or not of desired format
-                conv_res = ConversionResult(input=in_doc)
-                conv_res.status = ConversionStatus.FAILURE
+                conv_res = ConversionResult(
+                    input=in_doc,
+                    status=ConversionStatus.FAILURE,
+                )
                # TODO add error log why it failed.

        return conv_res
--- a/docling/exceptions.py
+++ b/docling/exceptions.py
@@ -0,0 +1,6 @@
+class BaseError(RuntimeError):
+    pass
+
+
+class ConversionError(BaseError):
+    pass
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -1,5 +1,7 @@
+import csv
 import io
 import logging
+import os
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
 from typing import Iterable, Optional, Tuple
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
        # _log.info(decoded_data)

        # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
+        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")

        # Display the dataframe (optional)
        # _log.info("df: ", df.head())
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
                        high_res_image = page._backend.get_page_image(
                            scale=self.scale, cropbox=ocr_rect
                        )
-
-                        with tempfile.NamedTemporaryFile(
-                            suffix=".png", mode="w"
-                        ) as image_file:
-                            fname = image_file.name
-                            high_res_image.save(fname)
+                        try:
+                            with tempfile.NamedTemporaryFile(
+                                suffix=".png", mode="w+b", delete=False
+                            ) as image_file:
+                                fname = image_file.name
+                                high_res_image.save(image_file)

                            df = self._run_tesseract(fname)
+                        finally:
+                            if os.path.exists(fname):
+                                os.remove(fname)

                        # _log.info(df)