Fundamental refactoring for multi-format support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-01 16:27:22 +02:00
parent cd06d89c2a
commit 1fa7cd9855
34 changed files with 2102 additions and 365 deletions


@ -67,11 +67,12 @@ pip install docling
### Convert a single document
To convert individual PDF documents, use `convert_single()`, for example:
```python
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
-converter = DocumentConverter()
+converter = PdfDocumentConverter()
result = converter.convert_single(source)
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."


@ -1,13 +1,11 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
+from typing import Set, Union
-from docling_core.types.experimental import BoundingBox, Size
+from docling_core.types.experimental import DoclingDocument
-from PIL import Image
-if TYPE_CHECKING:
-from docling.datamodel.base_models import Cell
+from docling.datamodel.base_models import InputFormat
class AbstractDocumentBackend(ABC):
@ -20,6 +18,11 @@ class AbstractDocumentBackend(ABC):
def is_valid(self) -> bool:
pass
+@classmethod
+@abstractmethod
+def is_paginated(cls) -> bool:
+pass
@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
@ -27,45 +30,19 @@ class AbstractDocumentBackend(ABC):
self.path_or_stream = None
+@classmethod
+@abstractmethod
+def supported_formats(cls) -> Set[InputFormat]:
+pass
-class PdfPageBackend(ABC):
-@abstractmethod
-def get_text_in_rect(self, bbox: "BoundingBox") -> str:
-pass
-@abstractmethod
-def get_text_cells(self) -> Iterable["Cell"]:
-pass
-@abstractmethod
-def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
-pass
-@abstractmethod
-def get_page_image(
-self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
-) -> Image.Image:
-pass
-@abstractmethod
-def get_size(self) -> "Size":
-pass
-@abstractmethod
-def is_valid(self) -> bool:
-pass
-@abstractmethod
-def unload(self):
-pass
-class PdfDocumentBackend(AbstractDocumentBackend):
-@abstractmethod
-def load_page(self, page_no: int) -> PdfPageBackend:
-pass
-@abstractmethod
-def page_count(self) -> int:
-pass
+class DeclarativeDocumentBackend(AbstractDocumentBackend):
+"""DeclarativeDocumentBackend.
+A declarative document backend is a backend that can transform to DoclingDocument
+straight without a recognition pipeline.
+"""
+@abstractmethod
+def convert(self) -> DoclingDocument:
+pass
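
To make the new contract concrete, here is a minimal sketch (not part of this commit) of a declarative backend. It only mirrors the shape of the HTML, Word and PowerPoint backends added below; the `EchoDocumentBackend` name and its line-by-line conversion are purely illustrative.

```python
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from docling_core.types.experimental import DescriptionItem, DocItemLabel, DoclingDocument

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat


class EchoDocumentBackend(DeclarativeDocumentBackend):
    """Illustrative backend: turns each non-empty input line into a text item."""

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        super().__init__(path_or_stream, document_hash)

    def is_valid(self) -> bool:
        return True

    @classmethod
    def is_paginated(cls) -> bool:
        return False  # declarative backends convert the whole document in one go

    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.HTML}  # illustration only; a real backend declares its own format

    def convert(self) -> DoclingDocument:
        doc = DoclingDocument(description=DescriptionItem(), name="echo")
        raw = (
            self.path_or_stream.read_text(errors="ignore")
            if isinstance(self.path_or_stream, Path)
            else self.path_or_stream.getvalue().decode(errors="ignore")
        )
        for line in raw.splitlines():
            if line.strip():
                doc.add_text(text=line, label=DocItemLabel.TEXT)
        return doc
```

Because such a backend produces a DoclingDocument directly, it can be driven by the new SimpleModelPipeline without any page-level models.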


@ -10,7 +10,7 @@ from docling_parse.docling_parse import pdf_parser
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
_log = logging.getLogger(__name__)


@ -0,0 +1,40 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def is_valid(self) -> bool:
return True
@classmethod
def is_paginated(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.HTML}
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT)
return doc


@ -0,0 +1,38 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def is_valid(self) -> bool:
return True
@classmethod
def is_paginated(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PPTX}
def convert(self) -> DoclingDocument:
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
doc.add_text(text="I am a Powerpoint document.", label=DocItemLabel.TEXT)
return doc


@ -0,0 +1,38 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def is_valid(self) -> bool:
return True
@classmethod
def is_paginated(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.DOCX}
def convert(self) -> DoclingDocument:
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
doc.add_text(text="I am a Word document.", label=DocItemLabel.TEXT)
return doc


@ -0,0 +1,59 @@
from abc import ABC, abstractmethod
from typing import Iterable, Optional, Set
from docling_core.types.experimental import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
class PdfPageBackend(ABC):
@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable["Cell"]:
pass
@abstractmethod
def get_bitmap_rects(self, scale: float = 1) -> Iterable["BoundingBox"]:
pass
@abstractmethod
def get_page_image(
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> "Size":
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
pass
class PdfDocumentBackend(AbstractDocumentBackend):
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@abstractmethod
def page_count(self) -> int:
pass
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PDF}
@classmethod
def is_paginated(cls) -> bool:
return True
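
For contrast with the declarative path, the following sketch (not part of the commit) walks a PDF's pages through the PdfDocumentBackend/PdfPageBackend interface. It assumes DoclingParseDocumentBackend keeps the `path_or_stream`/`document_hash` constructor used by InputDocument; the file path is just an example from this repo's test data.

```python
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.utils.utils import create_file_hash

pdf_path = Path("tests/data/2206.01062.pdf")  # example test file
backend = DoclingParseDocumentBackend(
    path_or_stream=pdf_path, document_hash=create_file_hash(pdf_path)
)

assert backend.is_paginated()  # PdfDocumentBackend reports itself as paginated

for page_no in range(backend.page_count()):
    page = backend.load_page(page_no)
    cells = list(page.get_text_cells())
    print(f"page {page_no}: size={page.get_size()}, {len(cells)} text cells")
    page.unload()  # free per-page resources, as the pipeline does after assembling

backend.unload()
```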


@ -8,10 +8,10 @@ import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage, PdfTextPage
+from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
_log = logging.getLogger(__name__)


@ -12,9 +12,9 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -190,12 +190,12 @@ def convert(
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
-pipeline_options = PipelineOptions(
+pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
-doc_converter = DocumentConverter(
+doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=pdf_backend,
)


@ -1,8 +1,8 @@
-import copy
import warnings
from enum import Enum, auto
from io import BytesIO
-from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
+from pathlib import Path
+from typing import Annotated, Dict, List, Optional, Union
from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.document import BasePictureData, TableCell
@ -11,8 +11,6 @@ from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
-from docling.backend.abstract_backend import PdfPageBackend
class ConversionStatus(str, Enum):
PENDING = auto()
@ -30,13 +28,29 @@ class InputFormat(str, Enum):
PDF = auto()
FormatToMimeType = {
InputFormat.DOCX: {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
},
InputFormat.PPTX: {
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
},
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
InputFormat.IMAGE: {"image/png", "image/jpeg"},
InputFormat.PDF: {"application/pdf"},
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
}
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()
class DoclingComponentType(str, Enum):
-PDF_BACKEND = auto()
+DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
@ -128,13 +142,13 @@ class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
page_no: int
-page_hash: Optional[str] = None
+# page_hash: Optional[str] = None
size: Optional[Size] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None
-_backend: Optional[PdfPageBackend] = (
+_backend: Optional["PdfPageBackend"] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
@ -170,14 +184,16 @@ class TableStructureOptions(BaseModel):
)
-class PipelineOptions(BaseModel):
+class PipelineOptions(BaseModel): ...
+class PdfPipelineOptions(PipelineOptions):
+artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
-class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,
Field(
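
The new FormatToMimeType/MimeTypeToFormat tables are what drives format detection in InputDocument below. As a rough illustration (the `guess_format` helper is hypothetical, not part of the commit), resolving an InputFormat from a path looks like this:

```python
from pathlib import Path
from typing import Optional

import filetype

from docling.datamodel.base_models import InputFormat, MimeTypeToFormat


def guess_format(path: Path) -> Optional[InputFormat]:
    """Mirror of the detection logic in InputDocument.__init__ / _init_doc."""
    mime = filetype.guess_mime(str(path))
    if mime is None and path.suffix == ".html":
        mime = "text/html"  # filetype cannot sniff plain HTML, so fall back to the suffix
    return MimeTypeToFormat.get(mime)


print(guess_format(Path("tests/data/word_sample.docx")))  # InputFormat.DOCX
print(guess_format(Path("tests/data/wiki_duck.html")))  # InputFormat.HTML
```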


@ -1,9 +1,10 @@
import logging
from io import BytesIO
from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
-from docling_core.types import BaseCell, BaseText
+import filetype
+from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
@ -19,8 +20,11 @@ from docling_core.types.experimental import (
from pydantic import BaseModel
from typing_extensions import deprecated
-from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
@ -28,13 +32,14 @@ from docling.datamodel.base_models import (
ErrorItem,
FigureElement,
InputFormat,
+MimeTypeToFormat,
Page,
PageElement,
Table,
TextElement,
)
from docling.datamodel.settings import DocumentLimits
-from docling.utils.utils import create_file_hash
+from docling.utils.utils import create_file_hash, create_hash
_log = logging.getLogger(__name__)
@ -71,8 +76,9 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.PDF: DoclingParseDocumentBackend,
-InputFormat.DOCX: None,
-InputFormat.PPTX: None,
+InputFormat.HTML: HTMLDocumentBackend,
+InputFormat.DOCX: MsWordDocumentBackend,
+InputFormat.PPTX: MsPowerpointDocumentBackend,
InputFormat.IMAGE: None,
}
@ -80,13 +86,14 @@ _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]]
class InputDocument(BaseModel):
file: PurePath = None
document_hash: Optional[str] = None
-valid: bool = False
+valid: bool = True
limits: DocumentLimits = DocumentLimits()
+format: Optional[InputFormat] = None
filesize: Optional[int] = None
-page_count: Optional[int] = None
+page_count: int = 0
-_backend: PdfDocumentBackend = None # Internal PDF backend used
+_backend: AbstractDocumentBackend = None # Internal PDF backend used
def __init__(
self,
@ -94,27 +101,31 @@ class InputDocument(BaseModel):
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
+format: Optional[InputFormat] = None,
):
super().__init__()
-if not backend:
-backend = _input_format_default_backends[InputFormat.PDF]
self.limits = limits or DocumentLimits()
try:
if isinstance(path_or_stream, Path):
+mime = filetype.guess_mime(str(path_or_stream))
+if mime is None:
+if path_or_stream.suffix == ".html":
+mime = "text/html"
self.file = path_or_stream
self.filesize = path_or_stream.stat().st_size
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
-self._backend = backend(
-path_or_stream=path_or_stream, document_hash=self.document_hash
-)
+self._init_doc(backend, mime, path_or_stream)
elif isinstance(path_or_stream, BytesIO):
+mime = filetype.guess_mime(path_or_stream.read(8192))
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes
@ -122,15 +133,15 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
-self._backend = backend(
-path_or_stream=path_or_stream, document_hash=self.document_hash
-)
-if self.document_hash and self._backend.page_count() > 0:
-self.page_count = self._backend.page_count()
-if self.page_count <= self.limits.max_num_pages:
-self.valid = True
+self._init_doc(backend, mime, path_or_stream)
+# For paginated backends, check if the maximum page count is exceeded.
+if self.valid and self._backend.is_valid():
+if self._backend.is_paginated():
+self.page_count = self._backend.page_count()
+if not self.page_count <= self.limits.max_num_pages:
+self.valid = False
except (FileNotFoundError, OSError) as e:
_log.exception(
@ -144,6 +155,27 @@ class InputDocument(BaseModel):
)
# raise
def _init_doc(
self,
backend: AbstractDocumentBackend,
mime: str,
path_or_stream: Union[BytesIO, Path],
) -> None:
self.format = MimeTypeToFormat.get(mime)
if self.format is not None:
backend = backend or _input_format_default_backends.get(self.format)
if backend is None:
raise RuntimeError(
f"Could not find suitable default backend for format: {self.format}"
)
if self.format is None or self.format not in backend.supported_formats():
# TODO decide if to raise exception here too.
self.valid = False
else:
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
@deprecated("Use `ConversionResult` instead.")
class ConvertedDocument(BaseModel):
@ -163,7 +195,11 @@ class ConvertedDocument(BaseModel):
desc = DsDocumentDescription(logs=[])
page_hashes = [
-PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
+PageReference(
+hash=create_hash(self.input.document_hash + ":" + str(p.page_no)),
+page=p.page_no + 1,
+model="default",
+)
for p in self.pages
]
@ -441,25 +477,21 @@ class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
limits: Optional[DocumentLimits] = DocumentLimits()
-DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
def docs(
-self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
+self, backend: Optional[Type[AbstractDocumentBackend]] = None
) -> Iterable[InputDocument]:
-pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
for obj in self._path_or_stream_iterator:
if isinstance(obj, Path):
yield InputDocument(
-path_or_stream=obj, limits=self.limits, backend=pdf_backend
+path_or_stream=obj, limits=self.limits, backend=backend
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
filename=obj.filename,
limits=self.limits,
-backend=pdf_backend,
+backend=backend,
)
@classmethod


@ -1,81 +1,78 @@
-import functools
import logging
import tempfile
import time
-import traceback
from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Dict, Iterable, List, Optional, Type
import requests
-from PIL import ImageDraw
-from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
+from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
-from docling.backend.abstract_backend import PdfDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.datamodel.base_models import (
-AssembledUnit,
-AssembleOptions,
-ConversionStatus,
-DoclingComponentType,
-ErrorItem,
-Page,
-PipelineOptions,
-)
+from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.settings import settings
-from docling.models.ds_glm_model import GlmModel
-from docling.models.page_assemble_model import PageAssembleModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
-from docling.pipeline.standard_model_pipeline import StandardModelPipeline
+from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
+from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
-from docling.utils.utils import chunkify, create_hash
+from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
-class DocumentConverter:
-_default_download_filename = "file.pdf"
-def __init__(
-self,
-artifacts_path: Optional[Union[Path, str]] = None,
-pipeline_options: PipelineOptions = PipelineOptions(),
-pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-assemble_options: AssembleOptions = AssembleOptions(),
-):
-if not artifacts_path:
-artifacts_path = self.download_models_hf()
-artifacts_path = Path(artifacts_path)
-self.model_pipeline = pipeline_cls(
-artifacts_path=artifacts_path, pipeline_options=pipeline_options
-)
-self.page_assemble_model = PageAssembleModel(config={})
-self.glm_model = GlmModel(config={})
-self.pdf_backend = pdf_backend
-self.assemble_options = assemble_options
-@staticmethod
-def download_models_hf(
-local_dir: Optional[Path] = None, force: bool = False
-) -> Path:
-from huggingface_hub import snapshot_download
-download_path = snapshot_download(
-repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
-)
-return Path(download_path)
+class FormatOption(BaseModel):
+pipeline_cls: Type[BaseModelPipeline]
+pipeline_options: Optional[PipelineOptions] = None
+backend: Optional[Type[AbstractDocumentBackend]]
+model_config = ConfigDict(arbitrary_types_allowed=True)
+def __init__(
+self,
+pipeline_cls: Type[BaseModelPipeline],
+pipeline_options: Optional[PipelineOptions] = None,
+backend: Optional[Type[AbstractDocumentBackend]] = None,
+):
+if pipeline_options is None:
+pipeline_options = pipeline_cls.get_default_options()
+super().__init__(
+pipeline_cls=pipeline_cls,
+pipeline_options=pipeline_options,
+backend=backend,
+)
+_format_to_default_options = {
+InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
+InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
+InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
+InputFormat.IMAGE: None,
+InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
+}
+class DocumentConverter:
+_default_download_filename = "file"
+def __init__(
+self,
+formats: List[InputFormat] = [e for e in InputFormat],
+format_options: Dict[InputFormat, FormatOption] = _format_to_default_options,
+):
+self.formats = formats
+self.format_to_options = format_options
+self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
+{}
+)
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
for input_batch in chunkify(
-input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
+input.docs(), settings.perf.doc_batch_size # pass format_options
):
_log.info(f"Going to convert document batch...")
# parallel processing only within input_batch
@ -84,8 +81,8 @@ class DocumentConverter:
# ) as pool:
#     yield from pool.map(self.process_document, input_batch)
-# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
+# Note: PDF backends are not thread-safe, thread pool usage was disabled.
-yield from map(self._process_document, input_batch)
+yield from map(self.process_document, input_batch)
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
"""Convert a single document.
@ -137,156 +134,42 @@ class DocumentConverter:
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
return conv_res
+def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
+pipeline_class = None
+fopt = self.format_to_options.get(doc.format)
+if fopt is None:
+return None
+else:
+pipeline_class = fopt.pipeline_cls
+if pipeline_class not in self.initialized_pipelines:
+self.initialized_pipelines[pipeline_class] = pipeline_class(
+pipeline_options=pipeline_class.get_default_options()
-def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-start_doc_time = time.time()
-conv_res = ConversionResult(input=in_doc)
-_log.info(f"Processing document {in_doc.file.name}")
-if not in_doc.valid:
conv_res.status = ConversionStatus.FAILURE
return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
all_assembled_pages = []
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
start_pb_time = time.time()
# Pipeline
# 1. Initialise the page resources
init_pages = map(
functools.partial(self._initialize_page, in_doc), page_batch
)
# 2. Populate page image
pages_with_images = map(
functools.partial(self._populate_page_images, in_doc), init_pages
)
# 3. Populate programmatic page cells
pages_with_cells = map(
functools.partial(self._parse_page_cells, in_doc),
pages_with_images,
)
# 4. Run pipeline stages
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
# 5. Assemble page elements (per page)
assembled_pages = self.page_assemble_model(pipeline_pages)
# exhaust assembled_pages
for assembled_page in assembled_pages:
# Free up mem resources before moving on with next batch
# Remove page images (can be disabled)
if self.assemble_options.images_scale is None:
assembled_page._image_cache = {}
# Unload backend
assembled_page._backend.unload()
all_assembled_pages.append(assembled_page)
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
# Free up mem resources of PDF backend
in_doc._backend.unload()
conv_res.pages = all_assembled_pages
self._assemble_doc(conv_res)
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if not page._backend.is_valid():
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.PDF_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
conv_res.status = status
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
-)
+)
return self.initialized_pipelines[pipeline_class]
def process_document(self, in_doc: InputDocument) -> ConversionResult:
start_doc_time = time.time()
conv_res = self._execute_pipeline(in_doc)
end_doc_time = time.time() - start_doc_time
-_log.info(
-f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-)
+_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
return conv_res
+def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult:
+if in_doc.valid and in_doc.format in self.formats:
+pipeline = self._get_pipeline(in_doc)
+if pipeline is None:  # Can't find a default pipeline. Should this raise?
+conv_res = ConversionResult(input=in_doc)
+conv_res.status = ConversionStatus.FAILURE
+return conv_res
+conv_res = pipeline.execute(in_doc)
+else:  # invalid doc or not of desired format
+conv_res = ConversionResult(input=in_doc)
+conv_res.status = ConversionStatus.FAILURE
+# TODO add error log why it failed.
+return conv_res
-# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-page._backend = doc._backend.load_page(page.page_no)
-page.size = page._backend.get_size()
-page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
-return page
-# Generate the page image and store it in the page object
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
# user requested scales
if self.assemble_options.images_scale is not None:
page._default_image_scale = self.assemble_options.images_scale
page.get_image(
scale=self.assemble_options.images_scale
) # this will trigger storing the image in the internal cache
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.get_image(scale=1.0), cells)
return page
def _assemble_doc(self, conv_res: ConversionResult):
all_elements = []
all_headers = []
all_body = []
for p in conv_res.pages:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.output, conv_res.experimental = self.glm_model(conv_res)


@ -0,0 +1,10 @@
from abc import ABC, abstractmethod
from typing import Iterable
from docling.datamodel.base_models import Page
class AbstractPageModel(ABC):
@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
pass
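
Every stage in a pipeline's model_pipe is expected to follow this one-method interface. A trivial, purely illustrative model (the class name is made up, not part of the commit) that passes pages through while logging their size could look like this:

```python
import logging
from typing import Iterable

from docling.datamodel.base_models import Page
from docling.models.abstract_model import AbstractPageModel

_log = logging.getLogger(__name__)


class PageSizeLoggerModel(AbstractPageModel):
    """No-op page model: yields pages unchanged, logging page number and size."""

    def __init__(self, config: dict):
        self.config = config

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            _log.info(f"page {page.page_no}: size={page.size}")
            yield page
```

A PaginatedModelPipeline chains such callables in apply_on_pages, so appending an instance to model_pipe is enough to plug it in.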


@ -3,7 +3,6 @@ import logging
from abc import abstractmethod
from typing import Iterable, List, Tuple
-import numpy
import numpy as np
from docling_core.types.experimental import BoundingBox, CoordOrigin
from PIL import Image, ImageDraw
@ -11,11 +10,12 @@ from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import OcrCell, Page
+from docling.models.abstract_model import AbstractPageModel
_log = logging.getLogger(__name__)
-class BaseOcrModel:
+class BaseOcrModel(AbstractPageModel):
def __init__(self, config):
self.config = config
self.enabled = config["enabled"]


@ -16,12 +16,13 @@ from docling.datamodel.base_models import (
LayoutPrediction,
Page,
)
+from docling.models.abstract_model import AbstractPageModel
from docling.utils import layout_utils as lu
_log = logging.getLogger(__name__)
-class LayoutModel:
+class LayoutModel(AbstractPageModel):
TEXT_ELEM_LABELS = [
DocItemLabel.TEXT,


@ -10,12 +10,13 @@ from docling.datamodel.base_models import (
Table,
TextElement,
)
+from docling.models.abstract_model import AbstractPageModel
from docling.models.layout_model import LayoutModel
_log = logging.getLogger(__name__)
-class PageAssembleModel:
+class PageAssembleModel(AbstractPageModel):
def __init__(self, config):
self.config = config
@ -145,4 +146,11 @@ class PageAssembleModel:
elements=elements, headers=headers, body=body
)
+# Remove page images (can be disabled)
+if self.config["images_scale"] is None:
+page._image_cache = {}
+# Unload backend
+page._backend.unload()
yield page


@ -0,0 +1,50 @@
from typing import Iterable
from PIL import ImageDraw
from docling.datamodel.base_models import Page
from docling.models.abstract_model import AbstractPageModel
class PagePreprocessingModel(AbstractPageModel):
def __init__(self, config):
self.config = config
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
page = self._populate_page_images(page)
page = self._parse_page_cells(page)
yield page
# Generate the page image and store it in the page object
def _populate_page_images(self, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
images_scale = self.config["images_scale"]
# user requested scales
if images_scale is not None:
page._default_image_scale = images_scale
page.get_image(
scale=images_scale
) # this will trigger storing the image in the internal cache
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.get_image(scale=1.0), cells)
return page


@ -9,9 +9,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
from PIL import ImageDraw
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.models.abstract_model import AbstractPageModel
-class TableStructureModel:
+class TableStructureModel(AbstractPageModel):
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]


@ -1,17 +1,117 @@
-from pathlib import Path
+import functools
import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.settings import settings
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
-class BaseModelPipeline:
+class BaseModelPipeline(ABC):
-def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
+def __init__(self, pipeline_options: PipelineOptions):
-self.model_pipe: List[Callable] = []
-self.artifacts_path = artifacts_path
self.pipeline_options = pipeline_options
+self.model_pipe: List[Callable] = []
-def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+@abstractmethod
def execute(self, in_doc: InputDocument) -> ConversionResult:
pass
@abstractmethod
def assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
pass
@classmethod
@abstractmethod
def get_default_options(cls) -> PipelineOptions:
pass
@classmethod
@abstractmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
pass
class PaginatedModelPipeline(BaseModelPipeline):
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
page_batch = model(page_batch)
yield from page_batch
def execute(self, in_doc: InputDocument) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
start_pb_time = time.time()
# 1. Initialise the page resources
init_pages = map(
functools.partial(self.initialize_page, in_doc), page_batch
)
# 2. Run pipeline stages
pipeline_pages = self.apply_on_pages(init_pages)
for p in pipeline_pages:
pass
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
# Free up mem resources of PDF backend
in_doc._backend.unload()
conv_res = self.assemble_document(in_doc, conv_res)
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if not page._backend.is_valid():
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.DOCUMENT_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
conv_res.status = status
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)
raise e
return conv_res
# Initialise and load resources for a page
@abstractmethod
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
pass


@ -0,0 +1,57 @@
import logging
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
from docling.datamodel.base_models import (
ConversionStatus,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.pipeline.base_model_pipeline import BaseModelPipeline
_log = logging.getLogger(__name__)
class SimpleModelPipeline(BaseModelPipeline):
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
def execute(self, in_doc: InputDocument) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
if not in_doc.valid:
conv_res.status = ConversionStatus.FAILURE
return conv_res
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
conv_res.status = ConversionStatus.FAILURE
return conv_res
conv_res.experimental = in_doc._backend.convert()
# Do other stuff with conv_res.experimental
conv_res = self.assemble_document(in_doc, conv_res)
conv_res.status = ConversionStatus.SUCCESS
return conv_res
def assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
return conv_res
@classmethod
def get_default_options(cls) -> PipelineOptions:
return PipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, DeclarativeDocumentBackend)


@ -1,38 +0,0 @@
from pathlib import Path
from docling.datamodel.base_models import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
class StandardModelPipeline(BaseModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
super().__init__(artifacts_path, pipeline_options)
self.model_pipe = [
EasyOcrModel(
config={
"lang": ["fr", "de", "es", "en"],
"enabled": pipeline_options.do_ocr,
}
),
LayoutModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._layout_model_path
}
),
TableStructureModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
}
),
]


@ -0,0 +1,108 @@
import logging
from pathlib import Path
from typing import Optional
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, InputDocument
from docling.models.ds_glm_model import GlmModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.page_preprocessing_model import PagePreprocessingModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
_log = logging.getLogger(__name__)
class StandardPdfModelPipeline(PaginatedModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
if not pipeline_options.artifacts_path:
artifacts_path = self.download_models_hf()
self.artifacts_path = Path(artifacts_path)
self.glm_model = GlmModel(config={})
self.model_pipe = [
PagePreprocessingModel(
config={"images_scale": pipeline_options.images_scale}
),
EasyOcrModel(
config={
"lang": ["fr", "de", "es", "en"],
"enabled": pipeline_options.do_ocr,
}
),
LayoutModel(
config={
"artifacts_path": artifacts_path
/ StandardPdfModelPipeline._layout_model_path
}
),
TableStructureModel(
config={
"artifacts_path": artifacts_path
/ StandardPdfModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
}
),
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
]
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
download_path = snapshot_download(
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
)
return Path(download_path)
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no)
page.size = page._backend.get_size()
return page
def assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
all_elements = []
all_headers = []
all_body = []
for p in conv_res.pages:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.output, conv_res.experimental = self.glm_model(conv_res)
return conv_res
@classmethod
def get_default_options(cls) -> PdfPipelineOptions:
return PdfPipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, PdfDocumentBackend)


@ -6,9 +6,9 @@ from typing import Iterable
import yaml
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -107,7 +107,11 @@ def main():
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
-doc_converter = DocumentConverter()
+doc_converter = PdfDocumentConverter(
+pipeline_options=PdfPipelineOptions(),
+pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
+pipeline_cls=StandardModelPipeline,
+)
input = DocumentConversionInput.from_paths(input_doc_paths)


@ -6,9 +6,9 @@ from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -93,12 +93,12 @@ def main():
# Docling Parse without OCR
# -------------------------
-pipeline_options = PipelineOptions()
+pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
-doc_converter = DocumentConverter(
+doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)


@ -4,14 +4,14 @@ from pathlib import Path
from typing import Tuple
from docling.datamodel.base_models import (
-AssembleOptions,
ConversionStatus,
FigureElement,
PageElement,
+PdfPipelineOptions,
Table,
)
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -30,12 +30,12 @@ def main():
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
-# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
-assemble_options = AssembleOptions()
+pipeline_options = PdfPipelineOptions()
-assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
-doc_converter = DocumentConverter(assemble_options=assemble_options)
+doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
start_time = time.time()

View File

@ -5,9 +5,9 @@ from pathlib import Path
import pandas as pd
-from docling.datamodel.base_models import AssembleOptions, ConversionStatus
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
from docling.utils.export import generate_multimodal_pages
_log = logging.getLogger(__name__)
@ -27,12 +27,12 @@ def main():
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
-# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
-assemble_options = AssembleOptions()
+pipeline_options = PdfPipelineOptions()
-assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
-doc_converter = DocumentConverter(assemble_options=assemble_options)
+doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
start_time = time.time()


@ -7,7 +7,7 @@ import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -22,7 +22,7 @@ def main():
input_files = DocumentConversionInput.from_paths(input_doc_paths)
-doc_converter = DocumentConverter()
+doc_converter = PdfDocumentConverter()
start_time = time.time()


@ -1,6 +1,6 @@
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
-converter = DocumentConverter()
+converter = PdfDocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"


@ -0,0 +1,41 @@
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
InputFormat,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
input_paths = [
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2206.01062.pdf"),
]
input = DocumentConversionInput.from_paths(input_paths)
# for defaults use:
doc_converter = DocumentConverter()
# to customize use:
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
# formats=[InputFormat.PDF, InputFormat.DOCX],
# format_options={
# InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
# InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
# }
# )
conv_results = doc_converter.convert(input)
for res in conv_results:
print(
f"Document {res.input.file.name} converted with status {res.status}. Content:"
)
print(res.experimental.export_to_markdown())

Binary file not shown.

tests/data/wiki_duck.html (new file, 1311 lines): file diff suppressed because one or more lines are too long.

tests/data/word_sample.docx (new binary file): binary file not shown.


@ -2,9 +2,9 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.base_models import PdfPipelineOptions
from docling.datamodel.document import ConversionResult
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
from .verify_utils import verify_conversion_result
@ -23,12 +23,12 @@ def get_pdf_paths():
def get_converter():
-pipeline_options = PipelineOptions()
+pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
-converter = DocumentConverter(
+converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)


@ -5,9 +5,9 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import DocumentStream, PipelineOptions
+from docling.datamodel.base_models import DocumentStream, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
from .verify_utils import verify_conversion_result
@ -21,12 +21,12 @@ def get_pdf_path():
@pytest.fixture
def converter():
-pipeline_options = PipelineOptions()
+pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
-converter = DocumentConverter(
+converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
@ -34,7 +34,7 @@ def converter():
return converter
-def test_convert_single(converter: DocumentConverter):
+def test_convert_single(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
@ -43,7 +43,7 @@ def test_convert_single(converter: DocumentConverter):
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
-def test_batch_path(converter: DocumentConverter):
+def test_batch_path(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
@ -55,7 +55,7 @@ def test_batch_path(converter: DocumentConverter):
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
-def test_batch_bytes(converter: DocumentConverter):
+def test_batch_bytes(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")