Fundamental refactoring for multi-format support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2024-10-01 16:27:22 +02:00 · 2024-10-01 16:27:22 +02:00 · 1fa7cd9855
commit 1fa7cd9855
parent cd06d89c2a
34 changed files with 2102 additions and 365 deletions
--- a/README.md
+++ b/README.md
@ -67,11 +67,12 @@ pip install docling
 ### Convert a single document

 To convert individual PDF documents, use `convert_single()`, for example:
+
 ```python
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
-converter = DocumentConverter()
+converter = PdfDocumentConverter()
 result = converter.convert_single(source)
 print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
 print(result.render_as_doctags())  # output: "<document><title><page_1><loc_20>..."
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@ -1,13 +1,11 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
+from typing import Set, Union

-from docling_core.types.experimental import BoundingBox, Size
-from PIL import Image
+from docling_core.types.experimental import DoclingDocument

-if TYPE_CHECKING:
-    from docling.datamodel.base_models import Cell
+from docling.datamodel.base_models import InputFormat


 class AbstractDocumentBackend(ABC):
@ -20,6 +18,11 @@ class AbstractDocumentBackend(ABC):
    def is_valid(self) -> bool:
        pass

+    @classmethod
+    @abstractmethod
+    def is_paginated(cls) -> bool:
+        pass
+
    @abstractmethod
    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
@ -27,45 +30,19 @@ class AbstractDocumentBackend(ABC):

        self.path_or_stream = None

-
-class PdfPageBackend(ABC):
-
+    @classmethod
    @abstractmethod
-    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
-        pass
-
-    @abstractmethod
-    def get_text_cells(self) -> Iterable["Cell"]:
-        pass
-
-    @abstractmethod
-    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
-        pass
-
-    @abstractmethod
-    def get_page_image(
-        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
-    ) -> Image.Image:
-        pass
-
-    @abstractmethod
-    def get_size(self) -> "Size":
-        pass
-
-    @abstractmethod
-    def is_valid(self) -> bool:
-        pass
-
-    @abstractmethod
-    def unload(self):
+    def supported_formats(cls) -> Set[InputFormat]:
        pass


-class PdfDocumentBackend(AbstractDocumentBackend):
-    @abstractmethod
-    def load_page(self, page_no: int) -> PdfPageBackend:
-        pass
+class DeclarativeDocumentBackend(AbstractDocumentBackend):
+    """DeclarativeDocumentBackend.
+
+    A declarative document backend is a backend that can transform to DoclingDocument
+    straight without a recognition pipeline.
+    """

    @abstractmethod
-    def page_count(self) -> int:
+    def convert(self) -> DoclingDocument:
        pass
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@ -10,7 +10,7 @@ from docling_parse.docling_parse import pdf_parser
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell

 _log = logging.getLogger(__name__)
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -0,0 +1,40 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.experimental import (
+    DescriptionItem,
+    DocItemLabel,
+    DoclingDocument,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+
+
+class HTMLDocumentBackend(DeclarativeDocumentBackend):
+    """Declarative backend for HTML input.
+
+    NOTE(review): ``convert()`` is currently a stub that emits a fixed
+    placeholder text and does not read ``path_or_stream`` yet.
+    """
+
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
+
+    def is_valid(self) -> bool:
+        """Always report the document as valid; no check is performed."""
+        return True
+
+    @classmethod
+    def is_paginated(cls) -> bool:
+        """Return False: this backend is not paginated."""
+        # The body used to be a bare `False` expression, so the method
+        # returned None. `@classmethod` matches the abstract declaration in
+        # AbstractDocumentBackend; calls on instances keep working.
+        return False
+
+    def unload(self):
+        """Close an in-memory stream, if any, and drop the source reference."""
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend."""
+        return {InputFormat.HTML}
+
+    def convert(self) -> DoclingDocument:
+        """Build a placeholder DoclingDocument holding a single text item."""
+        # access self.path_or_stream to load stuff
+        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
+        doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT)
+        return doc
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -0,0 +1,38 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.experimental import (
+    DescriptionItem,
+    DocItemLabel,
+    DoclingDocument,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+
+
+class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
+    """Declarative backend for PowerPoint (PPTX) input.
+
+    NOTE(review): ``convert()`` is currently a stub that emits a fixed
+    placeholder text and does not read ``path_or_stream`` yet.
+    """
+
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
+
+    def is_valid(self) -> bool:
+        """Always report the document as valid; no check is performed."""
+        return True
+
+    @classmethod
+    def is_paginated(cls) -> bool:
+        """Return False: this backend is not paginated."""
+        # The body used to be a bare `False` expression, so the method
+        # returned None. `@classmethod` matches the abstract declaration in
+        # AbstractDocumentBackend; calls on instances keep working.
+        return False
+
+    def unload(self):
+        """Close an in-memory stream, if any, and drop the source reference."""
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend."""
+        return {InputFormat.PPTX}
+
+    def convert(self) -> DoclingDocument:
+        """Build a placeholder DoclingDocument holding a single text item."""
+        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
+        doc.add_text(text="I am a Powerpoint document.", label=DocItemLabel.TEXT)
+        return doc
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -0,0 +1,38 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.experimental import (
+    DescriptionItem,
+    DocItemLabel,
+    DoclingDocument,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+
+
+class MsWordDocumentBackend(DeclarativeDocumentBackend):
+    """Declarative backend for Word (DOCX) input.
+
+    NOTE(review): ``convert()`` is currently a stub that emits a fixed
+    placeholder text and does not read ``path_or_stream`` yet.
+    """
+
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
+
+    def is_valid(self) -> bool:
+        """Always report the document as valid; no check is performed."""
+        return True
+
+    @classmethod
+    def is_paginated(cls) -> bool:
+        """Return False: this backend is not paginated."""
+        # The body used to be a bare `False` expression, so the method
+        # returned None. `@classmethod` matches the abstract declaration in
+        # AbstractDocumentBackend; calls on instances keep working.
+        return False
+
+    def unload(self):
+        """Close an in-memory stream, if any, and drop the source reference."""
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend."""
+        return {InputFormat.DOCX}
+
+    def convert(self) -> DoclingDocument:
+        """Build a placeholder DoclingDocument holding a single text item."""
+        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
+        doc.add_text(text="I am a Word document.", label=DocItemLabel.TEXT)
+        return doc
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@ -0,0 +1,59 @@
+from abc import ABC, abstractmethod
+from typing import Iterable, Optional, Set
+
+from docling_core.types.experimental import BoundingBox, Size
+from PIL import Image
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import Cell, InputFormat
+
+
+class PdfPageBackend(ABC):
+    """Abstract interface for a single page of a PDF document.
+
+    Every method is abstract. The per-method notes below are inferred from
+    names and signatures only; confirm them against the concrete backends.
+    """
+
+    @abstractmethod
+    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
+        # Presumably returns the page text located inside `bbox`.
+        pass
+
+    @abstractmethod
+    def get_text_cells(self) -> Iterable["Cell"]:
+        # Presumably yields the text cells found on the page.
+        pass
+
+    @abstractmethod
+    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
+        # NOTE(review): the parameter name `float` shadows the builtin and is
+        # annotated `int`; it looks like it was meant to be `scale`. Confirm
+        # against the implementations and callers before renaming.
+        pass
+
+    @abstractmethod
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
+    ) -> Image.Image:
+        # Presumably renders the page (or only `cropbox`) at the given scale.
+        pass
+
+    @abstractmethod
+    def get_size(self) -> "Size":
+        # Presumably returns the page dimensions.
+        pass
+
+    @abstractmethod
+    def is_valid(self) -> bool:
+        # Presumably reports whether the page could be loaded/parsed.
+        pass
+
+    @abstractmethod
+    def unload(self):
+        # Presumably releases resources held for this page.
+        pass
+
+
+class PdfDocumentBackend(AbstractDocumentBackend):
+    """Abstract document backend for PDF input.
+
+    Declares PDF as the only supported format and reports itself as
+    paginated; page loading and page counting are left to subclasses.
+    """
+
+    @abstractmethod
+    def load_page(self, page_no: int) -> PdfPageBackend:
+        # Returns the page backend for `page_no`.
+        # NOTE(review): whether `page_no` is 0- or 1-based is not visible here.
+        pass
+
+    @abstractmethod
+    def page_count(self) -> int:
+        # Returns the number of pages in the document.
+        pass
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by PDF backends."""
+        return {InputFormat.PDF}
+
+    @classmethod
+    def is_paginated(cls) -> bool:
+        """Return True: PDF backends are paginated."""
+        return True
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@ -8,10 +8,10 @@ import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
 from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage, PdfTextPage
+from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell

 _log = logging.getLogger(__name__)
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -12,9 +12,9 @@ from docling_core.utils.file import resolve_file_source

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -190,12 +190,12 @@ def convert(
        case _:
            raise RuntimeError(f"Unexpected backend type {backend}")

-    pipeline_options = PipelineOptions(
+    pipeline_options = PdfPipelineOptions(
        do_ocr=ocr,
        do_table_structure=True,
    )
    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
-    doc_converter = DocumentConverter(
+    doc_converter = PdfDocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=pdf_backend,
    )
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -1,8 +1,8 @@
-import copy
 import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
+from pathlib import Path
+from typing import Annotated, Dict, List, Optional, Union

 from docling_core.types.experimental import BoundingBox, Size
 from docling_core.types.experimental.document import BasePictureData, TableCell
@ -11,8 +11,6 @@ from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 from typing_extensions import Self

-from docling.backend.abstract_backend import PdfPageBackend
-

 class ConversionStatus(str, Enum):
    PENDING = auto()
@ -30,13 +28,29 @@ class InputFormat(str, Enum):
    PDF = auto()


+# MIME types recognised for each input format.
+FormatToMimeType = {
+    InputFormat.DOCX: {
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    },
+    InputFormat.PPTX: {
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+    },
+    InputFormat.HTML: {"text/html", "application/xhtml+xml"},
+    InputFormat.IMAGE: {"image/png", "image/jpeg"},
+    InputFormat.PDF: {"application/pdf"},
+}
+# Reverse lookup (MIME type -> input format), derived from FormatToMimeType.
+MimeTypeToFormat = {
+    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
+}
+
+
 class DocInputType(str, Enum):
    PATH = auto()
    STREAM = auto()


 class DoclingComponentType(str, Enum):
-    PDF_BACKEND = auto()
+    DOCUMENT_BACKEND = auto()
    MODEL = auto()
    DOC_ASSEMBLER = auto()

@ -128,13 +142,13 @@ class Page(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
-    page_hash: Optional[str] = None
+    # page_hash: Optional[str] = None
    size: Optional[Size] = None
    cells: List[Cell] = []
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

-    _backend: Optional[PdfPageBackend] = (
+    _backend: Optional["PdfPageBackend"] = (
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
@ -170,14 +184,16 @@ class TableStructureOptions(BaseModel):
    )


-class PipelineOptions(BaseModel):
+class PipelineOptions(BaseModel): ...
+
+
+class PdfPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()

-
-class AssembleOptions(BaseModel):
    keep_page_images: Annotated[
        bool,
        Field(
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -1,9 +1,10 @@
 import logging
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Type, Union

-from docling_core.types import BaseCell, BaseText
+import filetype
+from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
@ -19,8 +20,11 @@ from docling_core.types.experimental import (
 from pydantic import BaseModel
 from typing_extensions import deprecated

-from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import (
    AssembledUnit,
    ConversionStatus,
@ -28,13 +32,14 @@ from docling.datamodel.base_models import (
    ErrorItem,
    FigureElement,
    InputFormat,
+    MimeTypeToFormat,
    Page,
    PageElement,
    Table,
    TextElement,
 )
 from docling.datamodel.settings import DocumentLimits
-from docling.utils.utils import create_file_hash
+from docling.utils.utils import create_file_hash, create_hash

 _log = logging.getLogger(__name__)

@ -71,8 +76,9 @@ _EMPTY_DOCLING_DOC = DoclingDocument(

 _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
    InputFormat.PDF: DoclingParseDocumentBackend,
-    InputFormat.DOCX: None,
-    InputFormat.PPTX: None,
+    InputFormat.HTML: HTMLDocumentBackend,
+    InputFormat.DOCX: MsWordDocumentBackend,
+    InputFormat.PPTX: MsPowerpointDocumentBackend,
    InputFormat.IMAGE: None,
 }

@ -80,13 +86,14 @@ _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]]
 class InputDocument(BaseModel):
    file: PurePath = None
    document_hash: Optional[str] = None
-    valid: bool = False
+    valid: bool = True
    limits: DocumentLimits = DocumentLimits()
+    format: Optional[InputFormat] = None

    filesize: Optional[int] = None
-    page_count: Optional[int] = None
+    page_count: int = 0

-    _backend: PdfDocumentBackend = None  # Internal PDF backend used
+    _backend: AbstractDocumentBackend = None  # Internal PDF backend used

    def __init__(
        self,
@ -94,27 +101,31 @@ class InputDocument(BaseModel):
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
        backend: Optional[Type[AbstractDocumentBackend]] = None,
+        format: Optional[InputFormat] = None,
    ):
        super().__init__()

-        if not backend:
-            backend = _input_format_default_backends[InputFormat.PDF]
-
        self.limits = limits or DocumentLimits()

        try:
            if isinstance(path_or_stream, Path):
+                mime = filetype.guess_mime(str(path_or_stream))
+                if mime is None:
+                    if path_or_stream.suffix == ".html":
+                        mime = "text/html"
+
                self.file = path_or_stream
                self.filesize = path_or_stream.stat().st_size
                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = backend(
-                        path_or_stream=path_or_stream, document_hash=self.document_hash
-                    )
+
+                    self._init_doc(backend, mime, path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
+                mime = filetype.guess_mime(path_or_stream.read(8192))
+
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

@ -122,15 +133,15 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = backend(
-                        path_or_stream=path_or_stream, document_hash=self.document_hash
-                    )

-            if self.document_hash and self._backend.page_count() > 0:
+                    self._init_doc(backend, mime, path_or_stream)
+
+            # For paginated backends, check if the maximum page count is exceeded.
+            if self.valid and self._backend.is_valid():
+                if self._backend.is_paginated():
                    self.page_count = self._backend.page_count()
-
-                if self.page_count <= self.limits.max_num_pages:
-                    self.valid = True
+                    if not self.page_count <= self.limits.max_num_pages:
+                        self.valid = False

        except (FileNotFoundError, OSError) as e:
            _log.exception(
@ -144,6 +155,27 @@ class InputDocument(BaseModel):
            )
            # raise

+    def _init_doc(
+        self,
+        backend: Optional[Type[AbstractDocumentBackend]],
+        mime: Optional[str],
+        path_or_stream: Union[BytesIO, Path],
+    ) -> None:
+        """Resolve the input format from `mime` and instantiate the backend.
+
+        Sets `self.format` by looking `mime` up in `MimeTypeToFormat`. When no
+        explicit `backend` class is given, the default for the format is taken
+        from `_input_format_default_backends`. If the MIME type is unknown, or
+        the backend does not list the format in `supported_formats()`, the
+        document is marked invalid and no backend is created.
+
+        Raises:
+            RuntimeError: if the format is known but no backend class is
+                available for it.
+        """
+        self.format = MimeTypeToFormat.get(mime)
+        if self.format is not None:
+            backend = backend or _input_format_default_backends.get(self.format)
+            if backend is None:
+                raise RuntimeError(
+                    f"Could not find suitable default backend for format: {self.format}"
+                )
+        # `backend` may still be None here, but only when `self.format` is
+        # None, in which case the first operand short-circuits the check.
+        if self.format is None or self.format not in backend.supported_formats():
+            # TODO decide if to raise exception here too.
+            self.valid = False
+        else:
+            self._backend = backend(
+                path_or_stream=path_or_stream, document_hash=self.document_hash
+            )
+

@deprecated("Use `ConversionResult` instead.")
 class ConvertedDocument(BaseModel):
@ -163,7 +195,11 @@ class ConvertedDocument(BaseModel):
        desc = DsDocumentDescription(logs=[])

        page_hashes = [
-            PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
+            PageReference(
+                hash=create_hash(self.input.document_hash + ":" + str(p.page_no)),
+                page=p.page_no + 1,
+                model="default",
+            )
            for p in self.pages
        ]

@ -441,25 +477,21 @@ class DocumentConversionInput(BaseModel):
    _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

-    DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
-
    def docs(
-        self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
+        self, backend: Optional[Type[AbstractDocumentBackend]] = None
    ) -> Iterable[InputDocument]:

-        pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
-
        for obj in self._path_or_stream_iterator:
            if isinstance(obj, Path):
                yield InputDocument(
-                    path_or_stream=obj, limits=self.limits, backend=pdf_backend
+                    path_or_stream=obj, limits=self.limits, backend=backend
                )
            elif isinstance(obj, DocumentStream):
                yield InputDocument(
                    path_or_stream=obj.stream,
                    filename=obj.filename,
                    limits=self.limits,
-                    backend=pdf_backend,
+                    backend=backend,
                )

    @classmethod
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -1,81 +1,78 @@
-import functools
 import logging
 import tempfile
 import time
-import traceback
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Dict, Iterable, List, Optional, Type

 import requests
-from PIL import ImageDraw
-from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
+from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError

-from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.datamodel.base_models import (
-    AssembledUnit,
-    AssembleOptions,
-    ConversionStatus,
-    DoclingComponentType,
-    ErrorItem,
-    Page,
-    PipelineOptions,
-)
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
 from docling.datamodel.document import (
    ConversionResult,
    DocumentConversionInput,
    InputDocument,
 )
 from docling.datamodel.settings import settings
-from docling.models.ds_glm_model import GlmModel
-from docling.models.page_assemble_model import PageAssembleModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline
-from docling.pipeline.standard_model_pipeline import StandardModelPipeline
-from docling.utils.utils import chunkify, create_hash
+from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
+from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
+from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)


-class DocumentConverter:
-    _default_download_filename = "file.pdf"
+class FormatOption(BaseModel):
+    pipeline_cls: Type[BaseModelPipeline]
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Optional[Type[AbstractDocumentBackend]]
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(
        self,
-        artifacts_path: Optional[Union[Path, str]] = None,
-        pipeline_options: PipelineOptions = PipelineOptions(),
-        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-        assemble_options: AssembleOptions = AssembleOptions(),
+        pipeline_cls: Type[BaseModelPipeline],
+        pipeline_options: Optional[PipelineOptions] = None,
+        backend: Optional[Type[AbstractDocumentBackend]] = None,
    ):
-        if not artifacts_path:
-            artifacts_path = self.download_models_hf()
+        if pipeline_options is None:
+            pipeline_options = pipeline_cls.get_default_options()

-        artifacts_path = Path(artifacts_path)
-
-        self.model_pipeline = pipeline_cls(
-            artifacts_path=artifacts_path, pipeline_options=pipeline_options
+        super().__init__(
+            pipeline_cls=pipeline_cls,
+            pipeline_options=pipeline_options,
+            backend=backend,
        )

-        self.page_assemble_model = PageAssembleModel(config={})
-        self.glm_model = GlmModel(config={})
-        self.pdf_backend = pdf_backend
-        self.assemble_options = assemble_options

-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        from huggingface_hub import snapshot_download
+# Default FormatOption per input format; IMAGE maps to None (no default).
+_format_to_default_options = {
+    InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
+    InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
+    InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
+    InputFormat.IMAGE: None,
+    InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
+}

-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
+
+class DocumentConverter:
+    _default_download_filename = "file"
+
+    def __init__(
+        self,
+        formats: List[InputFormat] = [e for e in InputFormat],
+        format_options: Dict[InputFormat, FormatOption] = _format_to_default_options,
+    ):
+        self.formats = formats
+        self.format_to_options = format_options
+        self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
+            {}
        )

-        return Path(download_path)
-
    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:

        for input_batch in chunkify(
-            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
+            input.docs(), settings.perf.doc_batch_size  # pass format_options
        ):
            _log.info(f"Going to convert document batch...")
            # parallel processing only within input_batch
@ -84,8 +81,8 @@ class DocumentConverter:
            # ) as pool:
            #   yield from pool.map(self.process_document, input_batch)

-            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-            yield from map(self._process_document, input_batch)
+            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+            yield from map(self.process_document, input_batch)

    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
        """Convert a single document.
@ -137,156 +134,42 @@ class DocumentConverter:
            raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
        return conv_res

-    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
+    def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
+        """Return the cached pipeline instance for the format of `doc`.
+
+        Pipelines are created lazily, one instance per pipeline class, and
+        reused for later documents.
+
+        Returns:
+            The pipeline instance, or None when no format option is
+            configured for `doc.format`.
+        """
+        fopt = self.format_to_options.get(doc.format)
+        if fopt is None:
+            return None
+
+        pipeline_class = fopt.pipeline_cls
+        if pipeline_class not in self.initialized_pipelines:
+            # Use the options configured on the format option. Previously the
+            # class defaults were always passed, silently ignoring them.
+            pipeline_options = fopt.pipeline_options
+            if pipeline_options is None:
+                pipeline_options = pipeline_class.get_default_options()
+            # NOTE(review): the cache is keyed by pipeline class only, so when
+            # several formats share a class the first options seen win.
+            self.initialized_pipelines[pipeline_class] = pipeline_class(
+                pipeline_options=pipeline_options
+            )
+        return self.initialized_pipelines[pipeline_class]
+
+    def process_document(self, in_doc: InputDocument) -> ConversionResult:
        start_doc_time = time.time()
-        conv_res = ConversionResult(input=in_doc)

-        _log.info(f"Processing document {in_doc.file.name}")
-
-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
-
-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))
-
-        all_assembled_pages = []
-
-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
-                # Pipeline
-
-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self._initialize_page, in_doc), page_batch
-                )
-
-                # 2. Populate page image
-                pages_with_images = map(
-                    functools.partial(self._populate_page_images, in_doc), init_pages
-                )
-
-                # 3. Populate programmatic page cells
-                pages_with_cells = map(
-                    functools.partial(self._parse_page_cells, in_doc),
-                    pages_with_images,
-                )
-
-                # 4. Run pipeline stages
-                pipeline_pages = self.model_pipeline.apply(pages_with_cells)
-
-                # 5. Assemble page elements (per page)
-                assembled_pages = self.page_assemble_model(pipeline_pages)
-
-                # exhaust assembled_pages
-                for assembled_page in assembled_pages:
-                    # Free up mem resources before moving on with next batch
-
-                    # Remove page images (can be disabled)
-                    if self.assemble_options.images_scale is None:
-                        assembled_page._image_cache = {}
-
-                    # Unload backend
-                    assembled_page._backend.unload()
-
-                    all_assembled_pages.append(assembled_page)
-
-                end_pb_time = time.time() - start_pb_time
-                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
-
-            # Free up mem resources of PDF backend
-            in_doc._backend.unload()
-
-            conv_res.pages = all_assembled_pages
-            self._assemble_doc(conv_res)
-
-            status = ConversionStatus.SUCCESS
-            for page in conv_res.pages:
-                if not page._backend.is_valid():
-                    conv_res.errors.append(
-                        ErrorItem(
-                            component_type=DoclingComponentType.PDF_BACKEND,
-                            module_name=type(page._backend).__name__,
-                            error_message=f"Page {page.page_no} failed to parse.",
-                        )
-                    )
-                    status = ConversionStatus.PARTIAL_SUCCESS
-
-            conv_res.status = status
-
-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.info(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
-            )
+        conv_res = self._execute_pipeline(in_doc)

        end_doc_time = time.time() - start_doc_time
-        _log.info(
-            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-        )
+        _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")

        return conv_res

-    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)
-        page.size = page._backend.get_size()
-        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
+    def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult:
+        if in_doc.valid and in_doc.format in self.formats:
+            pipeline = self._get_pipeline(in_doc)
+            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                return conv_res

-        return page
+            conv_res = pipeline.execute(in_doc)
+        else:  # invalid doc or not of desired format
+            conv_res = ConversionResult(input=in_doc)
+            conv_res.status = ConversionStatus.FAILURE
+            # TODO add error log why it failed.

-    # Generate the page image and store it in the page object
-    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        # default scale
-        page.get_image(
-            scale=1.0
-        )  # puts the page image on the image cache at default scale
-
-        # user requested scales
-        if self.assemble_options.images_scale is not None:
-            page._default_image_scale = self.assemble_options.images_scale
-            page.get_image(
-                scale=self.assemble_options.images_scale
-            )  # this will trigger storing the image in the internal cache
-
-        return page
-
-    # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-        page.cells = page._backend.get_text_cells()
-
-        # DEBUG code:
-        def draw_text_boxes(image, cells):
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
-
-        # draw_text_boxes(page.get_image(scale=1.0), cells)
-
-        return page
-
-    def _assemble_doc(self, conv_res: ConversionResult):
-        all_elements = []
-        all_headers = []
-        all_body = []
-
-        for p in conv_res.pages:
-
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
-
-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )
-
-        conv_res.output, conv_res.experimental = self.glm_model(conv_res)
+        return conv_res
--- a/docling/models/abstract_model.py
+++ b/docling/models/abstract_model.py
@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+from docling.datamodel.base_models import Page
+
+
+class AbstractPageModel(ABC):
+    """Interface for a callable model that consumes and produces pages."""
+
+    @abstractmethod
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Process ``page_batch`` and return the resulting iterable of pages."""
+        pass
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@ -3,7 +3,6 @@ import logging
 from abc import abstractmethod
 from typing import Iterable, List, Tuple

-import numpy
 import numpy as np
 from docling_core.types.experimental import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
@ -11,11 +10,12 @@ from rtree import index
 from scipy.ndimage import find_objects, label

 from docling.datamodel.base_models import OcrCell, Page
+from docling.models.abstract_model import AbstractPageModel

 _log = logging.getLogger(__name__)


-class BaseOcrModel:
+class BaseOcrModel(AbstractPageModel):
    def __init__(self, config):
        self.config = config
        self.enabled = config["enabled"]
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -16,12 +16,13 @@ from docling.datamodel.base_models import (
    LayoutPrediction,
    Page,
 )
+from docling.models.abstract_model import AbstractPageModel
 from docling.utils import layout_utils as lu

 _log = logging.getLogger(__name__)


-class LayoutModel:
+class LayoutModel(AbstractPageModel):

    TEXT_ELEM_LABELS = [
        DocItemLabel.TEXT,
--- a/docling/models/page_assemble_model.py
+++ b/docling/models/page_assemble_model.py
@ -10,12 +10,13 @@ from docling.datamodel.base_models import (
    Table,
    TextElement,
 )
+from docling.models.abstract_model import AbstractPageModel
 from docling.models.layout_model import LayoutModel

 _log = logging.getLogger(__name__)


-class PageAssembleModel:
+class PageAssembleModel(AbstractPageModel):
    def __init__(self, config):
        self.config = config

@ -145,4 +146,11 @@ class PageAssembleModel:
                elements=elements, headers=headers, body=body
            )

+            # Remove page images (can be disabled)
+            if self.config["images_scale"] is None:
+                page._image_cache = {}
+
+            # Unload backend
+            page._backend.unload()
+
            yield page
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@ -0,0 +1,50 @@
+from typing import Iterable
+
+from PIL import ImageDraw
+
+from docling.datamodel.base_models import Page
+from docling.models.abstract_model import AbstractPageModel
+
+
+class PagePreprocessingModel(AbstractPageModel):
+    """Page model that renders page images and extracts the text cells."""
+
+    def __init__(self, config):
+        # ``config`` is read for the "images_scale" key when pages are processed.
+        self.config = config
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Lazily yield each page after image population and cell parsing."""
+        for current in page_batch:
+            with_images = self._populate_page_images(current)
+            yield self._parse_page_cells(with_images)
+
+    def _populate_page_images(self, page: Page) -> Page:
+        """Request the page image at scale 1.0 and at the configured scale."""
+        # Requesting the image puts it on the page's image cache at default scale.
+        page.get_image(scale=1.0)
+
+        requested_scale = self.config["images_scale"]
+        if requested_scale is None:
+            # No user-requested scale: the default-scale image is all we need.
+            return page
+
+        page._default_image_scale = requested_scale
+        # This triggers storing the image in the internal cache.
+        page.get_image(scale=requested_scale)
+        return page
+
+    def _parse_page_cells(self, page: Page) -> Page:
+        """Store the text cells reported by the page backend on the page."""
+        page.cells = page._backend.get_text_cells()
+
+        # DEBUG helper, intentionally not called in normal operation.
+        def draw_text_boxes(image, cells):
+            canvas = ImageDraw.Draw(image)
+            for cell in cells:
+                left, top, right, bottom = cell.bbox.as_tuple()
+                canvas.rectangle([(left, top), (right, bottom)], outline="red")
+            image.show()
+
+        # draw_text_boxes(page.get_image(scale=1.0), cells)
+
+        return page
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -9,9 +9,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
 from PIL import ImageDraw

 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.models.abstract_model import AbstractPageModel


-class TableStructureModel:
+class TableStructureModel(AbstractPageModel):
    def __init__(self, config):
        self.config = config
        self.do_cell_matching = config["do_cell_matching"]
--- a/docling/pipeline/base_model_pipeline.py
+++ b/docling/pipeline/base_model_pipeline.py
@ -1,17 +1,117 @@
-from pathlib import Path
+import functools
+import logging
+import time
+import traceback
+from abc import ABC, abstractmethod
 from typing import Callable, Iterable, List

-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
+    Page,
+    PipelineOptions,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.settings import settings
+from docling.utils.utils import chunkify
+
+_log = logging.getLogger(__name__)


-class BaseModelPipeline:
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        self.model_pipe: List[Callable] = []
-        self.artifacts_path = artifacts_path
+class BaseModelPipeline(ABC):
+    """Abstract base class for document conversion pipelines."""
+
+    def __init__(self, pipeline_options: PipelineOptions):
+        """Store the pipeline options and start with an empty model list."""
        self.pipeline_options = pipeline_options
+        self.model_pipe: List[Callable] = []

-    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    @abstractmethod
+    def execute(self, in_doc: InputDocument) -> ConversionResult:
+        """Run the pipeline on ``in_doc`` and return its conversion result."""
+        pass
+
+    @abstractmethod
+    def assemble_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+        """Build the document-level output on ``conv_res`` and return it."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def get_default_options(cls) -> PipelineOptions:
+        """Return the default options object for this pipeline class."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        """Tell whether this pipeline class can work with ``backend``."""
+        pass
+
+
+class PaginatedModelPipeline(BaseModelPipeline):
+    """Pipeline base for documents that are processed in batches of pages."""
+
+    def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Chain every model in ``self.model_pipe`` over ``page_batch``."""
        for model in self.model_pipe:
            page_batch = model(page_batch)

        yield from page_batch
+
+    def execute(self, in_doc: InputDocument) -> ConversionResult:
+        """Convert ``in_doc`` one page batch at a time.
+
+        Returns:
+            The conversion result. Its status is SUCCESS, or PARTIAL_SUCCESS
+            when at least one page backend reports itself as not valid (an
+            error item is recorded for each such page).
+
+        Raises:
+            Exception: any error raised during conversion is logged together
+                with its traceback and re-raised after the status has been
+                set to FAILURE.
+        """
+        conv_res = ConversionResult(input=in_doc)
+
+        _log.info(f"Processing document {in_doc.file.name}")
+
+        for i in range(0, in_doc.page_count):
+            conv_res.pages.append(Page(page_no=i))
+
+        try:
+            # Iterate batches of pages (page_batch_size) in the doc
+            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
+                start_pb_time = time.time()
+
+                # 1. Initialise the page resources
+                init_pages = map(
+                    functools.partial(self.initialize_page, in_doc), page_batch
+                )
+
+                # 2. Run pipeline stages
+                pipeline_pages = self.apply_on_pages(init_pages)
+
+                # The stages are chained lazily (map + generator); iterating
+                # the result is what actually drives the processing.
+                for p in pipeline_pages:
+                    pass
+
+                end_pb_time = time.time() - start_pb_time
+                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+
+            # Free up mem resources of PDF backend
+            in_doc._backend.unload()
+
+            conv_res = self.assemble_document(in_doc, conv_res)
+
+            status = ConversionStatus.SUCCESS
+            for page in conv_res.pages:
+                if not page._backend.is_valid():
+                    conv_res.errors.append(
+                        ErrorItem(
+                            component_type=DoclingComponentType.DOCUMENT_BACKEND,
+                            module_name=type(page._backend).__name__,
+                            error_message=f"Page {page.page_no} failed to parse.",
+                        )
+                    )
+                    status = ConversionStatus.PARTIAL_SUCCESS
+
+            conv_res.status = status
+
+        except Exception:
+            conv_res.status = ConversionStatus.FAILURE
+            # format_exc() returns newline-terminated text; joining the
+            # formatted lines with "\n" doubled every line break in the log.
+            trace = traceback.format_exc()
+            # A failed conversion is an error, not an informational event.
+            _log.error(
+                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
+                f"{trace}"
+            )
+            # A bare raise re-raises the active exception unchanged.
+            raise
+
+        return conv_res
+
+    # Initialise and load resources for a page
+    @abstractmethod
+    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+        """Attach the resources ``page`` needs from ``doc`` and return it."""
+        pass
--- a/docling/pipeline/simple_model_pipeline.py
+++ b/docling/pipeline/simple_model_pipeline.py
@ -0,0 +1,57 @@
+import logging
+
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    DeclarativeDocumentBackend,
+)
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    PdfPipelineOptions,
+    PipelineOptions,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.pipeline.base_model_pipeline import BaseModelPipeline
+
+_log = logging.getLogger(__name__)
+
+
+class SimpleModelPipeline(BaseModelPipeline):
+    """Pipeline for backends that produce the document in a single convert step."""
+
+    def __init__(self, pipeline_options: PipelineOptions):
+        """Store ``pipeline_options``; no page models are registered here.
+
+        The parameter is annotated with the base ``PipelineOptions`` type to
+        agree with ``get_default_options``, which returns plain
+        ``PipelineOptions``.
+        """
+        super().__init__(pipeline_options)
+
+    def execute(self, in_doc: InputDocument) -> ConversionResult:
+        """Convert ``in_doc`` through its declarative backend.
+
+        Returns:
+            The conversion result. Its status is FAILURE when the input is not
+            valid or its backend is not a ``DeclarativeDocumentBackend``, and
+            SUCCESS otherwise.
+        """
+        conv_res = ConversionResult(input=in_doc)
+
+        _log.info(f"Processing document {in_doc.file.name}")
+
+        if not in_doc.valid:
+            # Report why the conversion is rejected instead of failing silently.
+            _log.warning(f"Input document {in_doc.file.name} is not valid.")
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
+            _log.warning(
+                f"Backend {type(in_doc._backend).__name__} of document "
+                f"{in_doc.file.name} is not a DeclarativeDocumentBackend."
+            )
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        conv_res.experimental = in_doc._backend.convert()
+
+        # Do other stuff with conv_res.experimental
+
+        conv_res = self.assemble_document(in_doc, conv_res)
+
+        conv_res.status = ConversionStatus.SUCCESS
+
+        return conv_res
+
+    def assemble_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+        """Return ``conv_res`` unchanged; there is nothing to assemble here."""
+        return conv_res
+
+    @classmethod
+    def get_default_options(cls) -> PipelineOptions:
+        """Return a default-constructed ``PipelineOptions``."""
+        return PipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        """Return True when ``backend`` is a ``DeclarativeDocumentBackend``."""
+        return isinstance(backend, DeclarativeDocumentBackend)
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@ -1,38 +0,0 @@
-from pathlib import Path
-
-from docling.datamodel.base_models import PipelineOptions
-from docling.models.easyocr_model import EasyOcrModel
-from docling.models.layout_model import LayoutModel
-from docling.models.table_structure_model import TableStructureModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-
-
-class StandardModelPipeline(BaseModelPipeline):
-    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
-    _table_model_path = "model_artifacts/tableformer"
-
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        super().__init__(artifacts_path, pipeline_options)
-
-        self.model_pipe = [
-            EasyOcrModel(
-                config={
-                    "lang": ["fr", "de", "es", "en"],
-                    "enabled": pipeline_options.do_ocr,
-                }
-            ),
-            LayoutModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._layout_model_path
-                }
-            ),
-            TableStructureModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._table_model_path,
-                    "enabled": pipeline_options.do_table_structure,
-                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
-                }
-            ),
-        ]
--- a/docling/pipeline/standard_pdf_model_pipeline.py
+++ b/docling/pipeline/standard_pdf_model_pipeline.py
@ -0,0 +1,108 @@
+import logging
+from pathlib import Path
+from typing import Optional
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.models.ds_glm_model import GlmModel
+from docling.models.easyocr_model import EasyOcrModel
+from docling.models.layout_model import LayoutModel
+from docling.models.page_assemble_model import PageAssembleModel
+from docling.models.page_preprocessing_model import PagePreprocessingModel
+from docling.models.table_structure_model import TableStructureModel
+from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
+
+_log = logging.getLogger(__name__)
+
+
+class StandardPdfModelPipeline(PaginatedModelPipeline):
+    """Paginated PDF pipeline: preprocessing, OCR, layout, tables and assembly."""
+
+    # Model locations, relative to the artifacts directory.
+    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
+    _table_model_path = "model_artifacts/tableformer"
+
+    def __init__(self, pipeline_options: PdfPipelineOptions):
+        """Build the page model chain from ``pipeline_options``.
+
+        The model artifacts are read from ``pipeline_options.artifacts_path``
+        when it is set; otherwise they are fetched via ``download_models_hf``.
+        """
+        super().__init__(pipeline_options)
+
+        # Always bind the local name: previously it was only assigned in the
+        # download branch, so a configured path raised UnboundLocalError.
+        artifacts_path = pipeline_options.artifacts_path
+        if not artifacts_path:
+            artifacts_path = self.download_models_hf()
+
+        # Normalise to Path so the "/" joins below do not depend on the type
+        # of the configured value.
+        self.artifacts_path = Path(artifacts_path)
+        self.glm_model = GlmModel(config={})
+
+        self.model_pipe = [
+            PagePreprocessingModel(
+                config={"images_scale": pipeline_options.images_scale}
+            ),
+            EasyOcrModel(
+                config={
+                    "lang": ["fr", "de", "es", "en"],
+                    "enabled": pipeline_options.do_ocr,
+                }
+            ),
+            LayoutModel(
+                config={
+                    "artifacts_path": self.artifacts_path
+                    / StandardPdfModelPipeline._layout_model_path
+                }
+            ),
+            TableStructureModel(
+                config={
+                    "artifacts_path": self.artifacts_path
+                    / StandardPdfModelPipeline._table_model_path,
+                    "enabled": pipeline_options.do_table_structure,
+                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
+                }
+            ),
+            PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
+        ]
+
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        """Download the ``ds4sd/docling-models`` snapshot and return its path.
+
+        Args:
+            local_dir: forwarded to ``snapshot_download`` as ``local_dir``.
+            force: forwarded to ``snapshot_download`` as ``force_download``.
+        """
+        # Imported lazily so the dependency is only needed when downloading.
+        from huggingface_hub import snapshot_download
+
+        download_path = snapshot_download(
+            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
+        )
+
+        return Path(download_path)
+
+    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+        """Load the page backend from ``doc`` and record the page size."""
+        page._backend = doc._backend.load_page(page.page_no)
+        page.size = page._backend.get_size()
+
+        return page
+
+    def assemble_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+        """Merge the per-page assembled units and run the GLM model on them.
+
+        ``in_doc`` is part of the pipeline interface and is not used here.
+        """
+        all_elements = []
+        all_headers = []
+        all_body = []
+
+        # Concatenate the per-page lists in page order.
+        for p in conv_res.pages:
+            all_body.extend(p.assembled.body)
+            all_headers.extend(p.assembled.headers)
+            all_elements.extend(p.assembled.elements)
+
+        conv_res.assembled = AssembledUnit(
+            elements=all_elements, headers=all_headers, body=all_body
+        )
+
+        conv_res.output, conv_res.experimental = self.glm_model(conv_res)
+
+        return conv_res
+
+    @classmethod
+    def get_default_options(cls) -> PdfPipelineOptions:
+        """Return a default-constructed ``PdfPipelineOptions``."""
+        return PdfPipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        """Return True when ``backend`` is a ``PdfDocumentBackend``."""
+        return isinstance(backend, PdfDocumentBackend)
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@ -6,9 +6,9 @@ from typing import Iterable

 import yaml

-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 _log = logging.getLogger(__name__)

@ -107,7 +107,11 @@ def main():
    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

-    doc_converter = DocumentConverter()
+    # The pipeline class is named StandardPdfModelPipeline (the former
+    # standard_model_pipeline module no longer exists). Import it locally,
+    # since the imports visible in this example do not provide it.
+    from docling.pipeline.standard_pdf_model_pipeline import (
+        StandardPdfModelPipeline,
+    )
+
+    doc_converter = PdfDocumentConverter(
+        pipeline_options=PdfPipelineOptions(),
+        pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
+        pipeline_cls=StandardPdfModelPipeline,
+    )

    input = DocumentConversionInput.from_paths(input_doc_paths)

--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@ -6,9 +6,9 @@ from typing import Iterable

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 _log = logging.getLogger(__name__)

@ -93,12 +93,12 @@ def main():

    # Docling Parse without OCR
    # -------------------------
-    pipeline_options = PipelineOptions()
+    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

-    doc_converter = DocumentConverter(
+    doc_converter = PdfDocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
--- a/examples/export_figures.py
+++ b/examples/export_figures.py
@ -4,14 +4,14 @@ from pathlib import Path
 from typing import Tuple

 from docling.datamodel.base_models import (
-    AssembleOptions,
    ConversionStatus,
    FigureElement,
    PageElement,
+    PdfPipelineOptions,
    Table,
 )
 from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 _log = logging.getLogger(__name__)

@ -30,12 +30,12 @@ def main():

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
-    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+    # This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
-    assemble_options = AssembleOptions()
-    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

-    doc_converter = DocumentConverter(assemble_options=assemble_options)
+    doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)

    start_time = time.time()

--- a/examples/export_multimodal.py
+++ b/examples/export_multimodal.py
@ -5,9 +5,9 @@ from pathlib import Path

 import pandas as pd

-from docling.datamodel.base_models import AssembleOptions, ConversionStatus
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
 from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
 from docling.utils.export import generate_multimodal_pages

 _log = logging.getLogger(__name__)
@ -27,12 +27,12 @@ def main():

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
-    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+    # This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
-    assemble_options = AssembleOptions()
-    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

-    doc_converter = DocumentConverter(assemble_options=assemble_options)
+    doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)

    start_time = time.time()

--- a/examples/export_tables.py
+++ b/examples/export_tables.py
@ -7,7 +7,7 @@ import pandas as pd

 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 _log = logging.getLogger(__name__)

@ -22,7 +22,7 @@ def main():

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

-    doc_converter = DocumentConverter()
+    doc_converter = PdfDocumentConverter()

    start_time = time.time()

--- a/examples/minimal.py
+++ b/examples/minimal.py
@ -1,6 +1,6 @@
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
-converter = DocumentConverter()
+converter = PdfDocumentConverter()
 doc = converter.convert_single(source)
 print(doc.render_as_markdown())  # output: ## Docling Technical Report [...]"
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@ -0,0 +1,41 @@
+from pathlib import Path
+
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import (
+    InputFormat,
+    PdfPipelineOptions,
+    PipelineOptions,
+)
+from docling.datamodel.document import DocumentConversionInput
+from docling.document_converter import DocumentConverter, FormatOption
+from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
+from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
+
+# Sample inputs covering several of the supported input formats.
+input_paths = [
+    Path("tests/data/wiki_duck.html"),
+    Path("tests/data/word_sample.docx"),
+    Path("tests/data/powerpoint_sample.pptx"),
+    Path("tests/data/2206.01062.pdf"),
+]
+# Named ``conv_input`` rather than ``input`` to avoid shadowing the builtin.
+conv_input = DocumentConversionInput.from_paths(input_paths)
+
+# for defaults use:
+doc_converter = DocumentConverter()
+
+# to customize use:
+# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
+#     formats=[InputFormat.PDF, InputFormat.DOCX],
+#     format_options={
+#         InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
+#         InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
+#     }
+# )
+
+conv_results = doc_converter.convert(conv_input)
+
+# Report the status of every converted document, followed by its content.
+for res in conv_results:
+    print(
+        f"Document {res.input.file.name} converted with status {res.status}. Content:"
+    )
+    print(res.experimental.export_to_markdown())
--- a/tests/data/powerpoint_sample.pptx
+++ b/tests/data/powerpoint_sample.pptx
--- a/tests/data/wiki_duck.html
+++ b/tests/data/wiki_duck.html
--- a/tests/data/word_sample.docx
+++ b/tests/data/word_sample.docx
--- a/tests/test_e2e_conversion.py
+++ b/tests/test_e2e_conversion.py
@ -2,9 +2,9 @@ from pathlib import Path

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.base_models import PdfPipelineOptions
 from docling.datamodel.document import ConversionResult
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 from .verify_utils import verify_conversion_result

@ -23,12 +23,12 @@ def get_pdf_paths():

 def get_converter():

-    pipeline_options = PipelineOptions()
+    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

-    converter = DocumentConverter(
+    converter = PdfDocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@ -5,9 +5,9 @@ import pytest

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import DocumentStream, PipelineOptions
+from docling.datamodel.base_models import DocumentStream, PdfPipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 from .verify_utils import verify_conversion_result

@ -21,12 +21,12 @@ def get_pdf_path():
@pytest.fixture
 def converter():

-    pipeline_options = PipelineOptions()
+    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

-    converter = DocumentConverter(
+    converter = PdfDocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
@ -34,7 +34,7 @@ def converter():
    return converter


-def test_convert_single(converter: DocumentConverter):
+def test_convert_single(converter: PdfDocumentConverter):

    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")
@ -43,7 +43,7 @@ def test_convert_single(converter: DocumentConverter):
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


-def test_batch_path(converter: DocumentConverter):
+def test_batch_path(converter: PdfDocumentConverter):

    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")
@ -55,7 +55,7 @@ def test_batch_path(converter: DocumentConverter):
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


-def test_batch_bytes(converter: DocumentConverter):
+def test_batch_bytes(converter: PdfDocumentConverter):

    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")