feat!: Docling v2 (#117)

--------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-12-09 13:18:24 +00:00 · 2024-10-16 21:02:03 +02:00
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -1,68 +1,63 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Set, Union

-from PIL import Image
+from docling_core.types.doc import DoclingDocument

 if TYPE_CHECKING:
-    from docling.datamodel.base_models import BoundingBox, Cell, PageSize
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.document import InputDocument


-class PdfPageBackend(ABC):
-
+class AbstractDocumentBackend(ABC):
    @abstractmethod
-    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
-        pass
-
-    @abstractmethod
-    def get_text_cells(self) -> Iterable["Cell"]:
-        pass
-
-    @abstractmethod
-    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
-        pass
-
-    @abstractmethod
-    def get_page_image(
-        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
-    ) -> Image.Image:
-        pass
-
-    @abstractmethod
-    def get_size(self) -> "PageSize":
-        pass
-
-    @abstractmethod
-    def is_valid(self) -> bool:
-        pass
-
-    @abstractmethod
-    def unload(self):
-        pass
-
-
-class PdfDocumentBackend(ABC):
-    @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        self.path_or_stream = path_or_stream
-        self.document_hash = document_hash
-
-    @abstractmethod
-    def load_page(self, page_no: int) -> PdfPageBackend:
-        pass
-
-    @abstractmethod
-    def page_count(self) -> int:
-        pass
+        self.document_hash = in_doc.document_hash
+        self.input_format = in_doc.format

    @abstractmethod
    def is_valid(self) -> bool:
        pass

+    @classmethod
+    @abstractmethod
+    def supports_pagination(cls) -> bool:
+        pass
+
    @abstractmethod
    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None
+
+    @classmethod
+    @abstractmethod
+    def supported_formats(cls) -> Set["InputFormat"]:
+        pass
+
+
+class PaginatedDocumentBackend(AbstractDocumentBackend):
+    """PaginatedDocumentBackend.
+
+    A paginated document backend is a backend whose loaded document can report
+    how many pages it has through `page_count`.
+    """
+
+    @abstractmethod
+    def page_count(self) -> int:
+        pass
+
+
+class DeclarativeDocumentBackend(AbstractDocumentBackend):
+    """DeclarativeDocumentBackend.
+
+    A declarative document backend is a backend that can transform its input
+    straight into a DoclingDocument, without a recognition pipeline.
+    """
+
+    @abstractmethod
+    def convert(self) -> DoclingDocument:
+        """Build and return the DoclingDocument for the loaded input."""
+        pass
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -5,12 +5,14 @@ from pathlib import Path
 from typing import Iterable, List, Optional, Union

 import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin, Size
 from docling_parse.docling_parse import pdf_parser
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell
+from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)

@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):

        return image

-    def get_size(self) -> PageSize:
-        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def get_size(self) -> Size:
+        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):


 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)

-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser()

        success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
-                document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
            )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(document_hash, str(path_or_stream))

        if not success:
            raise RuntimeError(
-                f"docling-parse could not load document {document_hash}."
+                f"docling-parse could not load document with hash {self.document_hash}."
            )

    def page_count(self) -> int:
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -2,15 +2,19 @@ import logging
 import random
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Iterable, List, Optional, Union

 import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_parse.docling_parse import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell, Size
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)

@@ -190,8 +194,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):

        return image

-    def get_size(self) -> PageSize:
-        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def get_size(self) -> Size:
+        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
@@ -199,23 +203,23 @@ class DoclingParseV2PageBackend(PdfPageBackend):


 class DoclingParseV2DocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)

-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v2("fatal")

        success = False
        if isinstance(path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
-                document_hash, path_or_stream
+                self.document_hash, path_or_stream
            )
        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(document_hash, str(path_or_stream))
+            success = self.parser.load_document(self.document_hash, str(path_or_stream))

        if not success:
            raise RuntimeError(
-                f"docling-parse could not load document {document_hash}."
+                f"docling-parse v2 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -0,0 +1,425 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from bs4 import BeautifulSoup
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class HTMLDocumentBackend(DeclarativeDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Parse the HTML input into a BeautifulSoup tree.
+
+        The markup is read as UTF-8 from either an in-memory stream or a file
+        path and parsed with the "html.parser" backend. Any failure while
+        reading or parsing is re-raised as a RuntimeError.
+        """
+        super().__init__(in_doc, path_or_stream)
+        _log.debug("About to init HTML backend...")
+        self.soup = None
+        self.path_or_stream = path_or_stream
+
+        # Hierarchy bookkeeping: `parents[n]` is the node that content at
+        # nesting level `n` is attached to; `labels` counts the tag names seen.
+        self.max_levels = 10
+        self.level = 0
+        self.parents = {depth: None for depth in range(self.max_levels)}  # type: ignore
+        self.labels = {}  # type: ignore
+
+        try:
+            source = self.path_or_stream
+            if isinstance(source, BytesIO):
+                markup = source.getvalue().decode("utf-8")
+                self.soup = BeautifulSoup(markup, "html.parser")
+            if isinstance(source, Path):
+                markup = source.read_text(encoding="utf-8")
+                self.soup = BeautifulSoup(markup, "html.parser")
+        except Exception as e:
+            raise RuntimeError(
+                f"Could not initialize HTML backend for file with hash {self.document_hash}."
+            ) from e
+
+    def is_valid(self) -> bool:
+        """Return True when a BeautifulSoup tree was built during construction."""
+        return self.soup is not None
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend does not expose pages."""
+        return False
+
+    def unload(self):
+        """Close an in-memory input stream (if any) and drop the input reference."""
+        source = self.path_or_stream
+        if isinstance(source, BytesIO):
+            source.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats this backend handles (HTML only)."""
+        return {InputFormat.HTML}
+
+    def convert(self) -> DoclingDocument:
+        """Convert the parsed HTML tree into a DoclingDocument.
+
+        Returns:
+            The document populated by walking the HTML content.
+
+        Raises:
+            RuntimeError: if the backend holds no parsed tree.
+        """
+        doc = DoclingDocument(name="dummy")
+        _log.debug("Trying to convert HTML...")
+
+        if not self.is_valid() or self.soup is None:
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+
+        # "html.parser" does not synthesise a <body>, so an HTML fragment has
+        # `soup.body is None`; fall back to the whole tree instead of failing
+        # with an AttributeError.
+        content = self.soup.body or self.soup
+
+        # Replace <br> tags with newline characters
+        for br in content.find_all("br"):
+            br.replace_with("\n")
+
+        return self.walk(content, doc)
+
+    def walk(self, element, doc):
+        """Analyse every direct child of `element` and return `doc`.
+
+        Conversion is best effort: this method never raises. When a child
+        cannot be processed the error is logged with its traceback and the
+        remaining children of `element` are skipped, which keeps the original
+        stop-at-first-failure behaviour.
+        """
+        # analyse_element() also sends nodes without a `children` attribute
+        # here (e.g. plain text nodes); there is nothing to descend into.
+        children = getattr(element, "children", None)
+        if children is None:
+            return doc
+
+        # Iterate over elements in the body of the document
+        for idx, child in enumerate(children):
+            try:
+                self.analyse_element(child, idx, doc)
+            except Exception:
+                # Logger methods take a %-style format string; the values go in
+                # as arguments, not as extra message parts.
+                _log.exception(" -> error treating child: %r", child)
+                break
+
+        return doc
+
+    def analyse_element(self, element, idx, doc):
+        """Count the element's tag name and hand it to the matching handler.
+
+        Tags without a dedicated handler are descended into with `walk`.
+        """
+        tag = element.name
+        self.labels[tag] = self.labels.get(tag, 0) + 1
+
+        handlers = {
+            "p": self.handle_paragraph,
+            "ul": self.handle_list,
+            "ol": self.handle_list,
+            "li": self.handle_listitem,
+            "table": self.handle_table,
+            "figure": self.handle_figure,
+            "img": self.handle_image,
+        }
+        for heading in ("h1", "h2", "h3", "h4", "h5", "h6"):
+            handlers[heading] = self.handle_header
+
+        handler = handlers.get(tag)
+        if handler is None:
+            self.walk(element, doc)
+            return
+
+        handler(element, idx, doc)
+
+    def get_direct_text(self, item):
+        """Return the stripped first direct string child of `item`, or "" if none.
+
+        Only immediate children are searched, so text inside nested tags
+        (for example a nested list) is ignored.
+        """
+        first_string = item.find(string=True, recursive=False)
+        return first_string.strip() if isinstance(first_string, str) else ""
+
+    # Function to recursively extract text from all child nodes
+    def extract_text_recursively(self, item):
+        """Collect the text of `item` and all of its descendants.
+
+        Args:
+            item: a string node or a tag-like object that iterates its children.
+
+        Returns:
+            For a string input, a one-element list holding that string
+            (unchanged from the previous contract). Otherwise a single string
+            with the stripped, non-empty text pieces joined by one space.
+
+        Previously a child's string result was passed to `list.extend`, which
+        split it into single characters, and the first direct string was added
+        twice (once up front, once while iterating the children).
+        """
+        if isinstance(item, str):
+            return [item]
+
+        try:
+            children = list(item)
+        except TypeError:
+            _log.warning("item has no children")
+            return ""
+
+        pieces = []
+        for child in children:
+            if isinstance(child, str):
+                piece = child.strip()
+            else:
+                # Non-string children always come back as a joined string.
+                piece = self.extract_text_recursively(child)
+            if piece:
+                pieces.append(piece)
+
+        return " ".join(pieces)
+
+    def handle_header(self, element, idx, doc):
+        """Handles header tags (h1, h2, etc.).
+
+        An h1 resets the whole parent hierarchy and is added as the title.
+        Any other heading is added as a section header below the parent one
+        level up; skipped levels are bridged with section groups and deeper
+        stale parents are cleared when moving back up.
+        """
+        hlevel = int(element.name.replace("h", ""))
+        text = element.text.strip()
+
+        if hlevel == 1:
+            for key in list(self.parents):
+                self.parents[key] = None
+
+            self.level = 1
+            self.parents[self.level] = doc.add_text(
+                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
+            )
+            return
+
+        if hlevel > self.level:
+            # add invisible group
+            for i in range(self.level + 1, hlevel):
+                self.parents[i] = doc.add_group(
+                    name=f"header-{i}",
+                    label=GroupLabel.SECTION,
+                    parent=self.parents[i - 1],
+                )
+        elif hlevel < self.level:
+            # remove the tail
+            for key in list(self.parents):
+                if key > hlevel:
+                    self.parents[key] = None
+
+        self.parents[hlevel] = doc.add_text(
+            parent=self.parents[hlevel - 1],
+            label=DocItemLabel.SECTION_HEADER,
+            text=text,
+        )
+        self.level = hlevel
+
+    def handle_paragraph(self, element, idx, doc):
+        """Handles paragraph tags (p); empty paragraphs are dropped."""
+        raw_text = element.text
+        if raw_text is None:
+            return
+
+        text = raw_text.strip()
+        if not text:
+            return
+
+        doc.add_text(
+            parent=self.parents[self.level], label=DocItemLabel.PARAGRAPH, text=text
+        )
+
+    def handle_list(self, element, idx, doc):
+        """Handles list tags (ul, ol) and their list items.
+
+        A list group is created one level below the current one, the list's
+        children are walked at that level, and the level is restored afterwards.
+        """
+        list_kinds = {
+            "ul": ("list", GroupLabel.LIST),
+            "ol": ("ordered list", GroupLabel.ORDERED_LIST),
+        }
+        kind = list_kinds.get(element.name)
+        if kind is not None:
+            group_name, group_label = kind
+            # create a list group
+            self.parents[self.level + 1] = doc.add_group(
+                parent=self.parents[self.level], name=group_name, label=group_label
+            )
+
+        self.level += 1
+        self.walk(element, doc)
+
+        # NOTE(review): this clears the slot *below* the list group, not the
+        # group's own slot - confirm that is intended.
+        self.parents[self.level + 1] = None
+        self.level -= 1
+
+    def handle_listitem(self, element, idx, doc):
+        """Handles listitem tags (li).
+
+        An item that contains a nested list is added with only its direct text
+        and then becomes the parent while its children are walked. Any other
+        item is added with its full text. Items of an ordered list get a
+        1-based marker derived from the parent's current child count.
+        """
+        nested_lists = element.find(["ul", "ol"])
+
+        parent_list_label = self.parents[self.level].label
+        index_in_list = len(self.parents[self.level].children) + 1
+        enumerated = parent_list_label == GroupLabel.ORDERED_LIST
+
+        if nested_lists:
+            text = self.get_direct_text(element)
+
+            # NOTE(review): the marker has no trailing "." here but has one in
+            # the plain-item branch below; kept as-is, confirm which is wanted.
+            marker = str(index_in_list) if enumerated else ""
+
+            # create a list-item
+            self.parents[self.level + 1] = doc.add_list_item(
+                text=text,
+                enumerated=enumerated,
+                marker=marker,
+                parent=self.parents[self.level],
+            )
+            self.level += 1
+
+            self.walk(element, doc)
+
+            self.parents[self.level + 1] = None
+            self.level -= 1
+
+        elif isinstance(element.text, str):
+            text = element.text.strip()
+
+            marker = f"{index_in_list}." if enumerated else ""
+            doc.add_list_item(
+                text=text,
+                enumerated=enumerated,
+                marker=marker,
+                parent=self.parents[self.level],
+            )
+        else:
+            # The element goes in as a logging argument; passing it as an extra
+            # positional without a placeholder triggers a formatting error.
+            _log.warning("list-item has no text: %s", element)
+
+    def handle_table(self, element, idx, doc):
+        """Handles table tags.
+
+        Builds a TableData from the <tr>/<td>/<th> structure, honouring
+        colspan/rowspan, and adds it under the current parent. Tables that
+        contain nested tables are skipped.
+
+        Robustness notes:
+        - malformed or non-positive span attributes are treated as 1;
+        - a rowspan reaching past the last row is clipped to the table;
+        - the column count is derived from where cells actually land, so cells
+          pushed right by a rowspan from an earlier row cannot overflow;
+        - a failure while extracting a cell's text is logged and the cell's
+          plain text is used, instead of terminating the process.
+        """
+
+        def parse_span(html_cell, attribute):
+            # Return the span attribute as an int >= 1, defaulting to 1.
+            try:
+                span = int(html_cell.get(attribute, 1))
+            except (TypeError, ValueError):
+                return 1
+            return max(span, 1)
+
+        if element.find("table") is not None:
+            _log.warning("detected nested tables: skipping for now")
+            return
+
+        rows = element.find_all("tr")
+        num_rows = len(rows)
+        num_cols = 0
+
+        # Grid positions already covered by a cell (including spanned ones).
+        occupied = set()
+        table_cells = []
+
+        for row_idx, row in enumerate(rows):
+            # For each row, find all the column cells (both <td> and <th>)
+            cells = row.find_all(["td", "th"])
+
+            # A row made only of <th> cells is a column-header row.
+            col_header = all(html_cell.name == "th" for html_cell in cells)
+
+            col_idx = 0
+            for html_cell in cells:
+                try:
+                    text = self.extract_table_cell_text(html_cell)
+                except Exception:
+                    _log.exception("could not extract table cell text")
+                    text = html_cell.text
+
+                col_span = parse_span(html_cell, "colspan")
+                row_span = min(parse_span(html_cell, "rowspan"), num_rows - row_idx)
+
+                # Skip positions taken by cells spanning down from earlier rows.
+                while (row_idx, col_idx) in occupied:
+                    col_idx += 1
+                for r in range(row_idx, row_idx + row_span):
+                    for c in range(col_idx, col_idx + col_span):
+                        occupied.add((r, c))
+                num_cols = max(num_cols, col_idx + col_span)
+
+                table_cells.append(
+                    TableCell(
+                        text=text,
+                        row_span=row_span,
+                        col_span=col_span,
+                        start_row_offset_idx=row_idx,
+                        end_row_offset_idx=row_idx + row_span,
+                        start_col_offset_idx=col_idx,
+                        end_col_offset_idx=col_idx + col_span,
+                        col_header=col_header,
+                        row_header=((not col_header) and html_cell.name == "th"),
+                    )
+                )
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=table_cells)
+        doc.add_table(data=data, parent=self.parents[self.level])
+
+    def get_list_text(self, list_element, level=0):
+        """Render a <ul>/<ol> as indented text lines, recursing into nested lists.
+
+        Ordered lists are numbered from 1, unordered lists use "*"; every
+        nesting level adds four spaces of indentation. Any other element
+        yields an empty list.
+        """
+        tag = list_element.name
+        if tag not in ("ol", "ul"):
+            return []
+
+        indent = "    " * level
+        lines = []
+        for position, li in enumerate(list_element.find_all("li", recursive=False), 1):
+            marker = f"{position}." if tag == "ol" else "*"
+            lines.append(f"{indent}{marker} {li.get_text(strip=True)}")
+
+            # Handle nested lists
+            sublist = li.find(["ul", "ol"])
+            if sublist:
+                lines.extend(self.get_list_text(sublist, level + 1))
+
+        return lines
+
+    def extract_table_cell_text(self, cell):
+        """Return the text of a table cell.
+
+        Cells containing lists are only reported in the debug log for now;
+        their text is returned the same way as for any other cell.
+        """
+        if cell.find(["ul", "ol"]) is not None:
+            _log.debug(
+                "should extract the content correctly for table-cells with lists ..."
+            )
+        return cell.text
+
+    def handle_figure(self, element, idx, doc):
+        """Handles figure tags, using the <figcaption> text as picture caption.
+
+        Without a <figcaption> the picture is added with no caption. Otherwise
+        a caption text item is created first and attached to the picture.
+        """
+        figcaption = element.find(["figcaption"])
+        if figcaption is None:
+            doc.add_picture(parent=self.parents[self.level], caption=None)
+            return
+
+        # get_text() concatenates every string inside the caption; reading
+        # `.text` on each child relied on string children exposing that
+        # attribute.
+        fig_caption = doc.add_text(
+            label=DocItemLabel.CAPTION, text=figcaption.get_text().strip()
+        )
+        doc.add_picture(
+            parent=self.parents[self.level],
+            caption=fig_caption,
+        )
+
+    def handle_image(self, element, idx, doc):
+        """Handles image tags (img) by adding a caption-less picture."""
+        # Neither the element's attributes nor `idx` are used here.
+        doc.add_picture(parent=self.parents[self.level], caption=None)
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -0,0 +1,375 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+)
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+
+from docling.backend.abstract_backend import (
+    DeclarativeDocumentBackend,
+    PaginatedDocumentBackend,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Open the presentation from a stream or a file path.
+
+        Raises:
+            RuntimeError: if python-pptx fails to load the input.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # XML namespaces used when querying paragraph elements.
+        self.namespaces = {
+            "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+            "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
+            "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
+        }
+        # Powerpoint file:
+        self.path_or_stream = path_or_stream
+
+        self.pptx_obj = None
+        self.valid = False
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.pptx_obj = Presentation(self.path_or_stream)
+            elif isinstance(self.path_or_stream, Path):
+                self.pptx_obj = Presentation(str(self.path_or_stream))
+        except Exception as e:
+            raise RuntimeError(
+                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+
+        # The backend is valid only if a presentation was actually opened; an
+        # input that is neither BytesIO nor Path leaves `pptx_obj` unset.
+        self.valid = self.pptx_obj is not None
+
+    def page_count(self) -> int:
+        """Return the number of slides, or 0 when the backend is not valid."""
+        if not self.is_valid():
+            return 0
+
+        assert self.pptx_obj is not None
+        return len(self.pptx_obj.slides)
+
+    def is_valid(self) -> bool:
+        """Return the validity flag set while loading the presentation."""
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend exposes pages (see `page_count`)."""
+        return True  # True? if so, how to handle pages...
+
+    def unload(self):
+        """Close an in-memory input stream (if any) and drop the input reference."""
+        source = self.path_or_stream
+        if isinstance(source, BytesIO):
+            source.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats this backend handles (PPTX only)."""
+        return {InputFormat.PPTX}
+
+    def convert(self) -> DoclingDocument:
+        """Parse the PPTX into a structured document model.
+
+        The document is named after the input file's stem, or "stream" when
+        the input has no file name; its origin records the file name and the
+        document hash.
+        """
+        source = self.path_or_stream
+        fname = source.name if isinstance(source, Path) else ""
+        docname = Path(fname).stem if fname else "stream"
+
+        # NOTE(review): this is the legacy .ppt media type, not the OOXML
+        # .pptx one - confirm which value the origin model expects.
+        origin = DocumentOrigin(
+            filename=fname,
+            mimetype="application/vnd.ms-powerpoint",
+            binary_hash=self.document_hash,
+        )
+
+        # must add origin information
+        doc = DoclingDocument(name=docname, origin=origin)
+        return self.walk_linear(self.pptx_obj, doc)
+
+    def generate_prov(self, shape, slide_ind, text=""):
+        """Build the provenance item for a shape on a slide.
+
+        The bounding box spans the shape's left/top to left+width/top+height,
+        the page number is the 1-based slide index and the character span
+        covers the whole of `text`.
+        """
+        x0 = shape.left
+        y0 = shape.top
+        x1 = x0 + shape.width
+        y1 = y0 + shape.height
+
+        # NOTE(review): the box is built from left/top offsets but tagged with
+        # a bottom-left origin - confirm the intended coordinate origin.
+        bbox = BoundingBox.from_tuple([x0, y0, x1, y1], origin=CoordOrigin.BOTTOMLEFT)
+
+        return ProvenanceItem(
+            page_no=slide_ind + 1, charspan=[0, len(text)], bbox=bbox
+        )
+
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
+        """Add the paragraphs of a text shape to the document.
+
+        Each paragraph with at least one non-blank run is added exactly once,
+        either as a list item (when it carries a bullet or auto-numbering
+        marker) or as a text item labelled from the shape's placeholder type.
+
+        Fixes over the previous version:
+        - subtitle placeholders now really get the SECTION_HEADER label (the
+          label used to be a bare expression with no assignment);
+        - the paragraph text was emitted once per run, so a paragraph with
+          several runs was added several times;
+        - bullet detection is done once per paragraph, not again per run.
+        """
+        ns = {"a": self.namespaces["a"]}
+        enum_list_item_value = 0
+
+        for paragraph in shape.text_frame.paragraphs:
+            enum_list_item_value += 1
+
+            # Check if paragraph is a bullet point using the `element` XML
+            p = paragraph._element
+            has_bullet_char = p.find(".//a:buChar", namespaces=ns) is not None
+            is_numbered = (
+                not has_bullet_char
+                and p.find(".//a:buAutoNum", namespaces=ns) is not None
+            )
+            is_list_item = has_bullet_char or is_numbered
+
+            # An indented paragraph is most likely a sub-list
+            is_a_list = is_list_item or paragraph.level > 0
+
+            list_text = paragraph.text.strip()
+            prov = self.generate_prov(shape, slide_ind, shape.text.strip())
+
+            new_list = None
+            if is_a_list:
+                list_label = (
+                    GroupLabel.ORDERED_LIST if is_numbered else GroupLabel.LIST
+                )
+                new_list = doc.add_group(
+                    label=list_label, name="list", parent=parent_slide
+                )
+                _log.debug("LIST DETECTED!")
+            else:
+                _log.debug("No List")
+
+            # Paragraphs without any non-blank run contribute no text item.
+            has_text_run = any(
+                (e.text or "").strip()
+                for e in p.iterfind(".//a:r", namespaces=ns)
+            )
+            if not has_text_run:
+                continue
+
+            if is_list_item:
+                # The marker is the paragraph's running position in the shape.
+                doc.add_list_item(
+                    marker=str(enum_list_item_value) + ".",
+                    enumerated=is_numbered,
+                    parent=new_list,
+                    text=list_text,
+                    prov=prov,
+                )
+                continue
+
+            # Assign proper label to the text, depending if it's a Title or
+            # Section Header. For other types of text, assign - PARAGRAPH
+            doc_label = DocItemLabel.PARAGRAPH
+            if shape.is_placeholder:
+                placeholder_type = shape.placeholder_format.type
+                if placeholder_type in (
+                    PP_PLACEHOLDER.CENTER_TITLE,
+                    PP_PLACEHOLDER.TITLE,
+                ):
+                    doc_label = DocItemLabel.TITLE
+                elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                    doc_label = DocItemLabel.SECTION_HEADER
+
+            # Plain text restarts the numbering of following list items.
+            enum_list_item_value = 0
+
+            doc.add_text(
+                label=doc_label,
+                parent=parent_slide,
+                text=list_text,
+                prov=prov,
+            )
+        return
+
+    def handle_title(self, shape, parent_slide, slide_ind, doc):
+        """Add a title or subtitle placeholder shape to the document.
+
+        Title and centre-title placeholders are added with the TITLE label and
+        subtitle placeholders with the SECTION_HEADER label. Shapes whose
+        stripped text is empty, and every other placeholder type, add nothing.
+        """
+        placeholder_type = shape.placeholder_format.type
+        txt = shape.text.strip()
+        prov = self.generate_prov(shape, slide_ind, txt)
+
+        if not txt:
+            return
+
+        if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
+            _log.info(f"Title found: {shape.text}")
+            label = DocItemLabel.TITLE
+        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+            _log.info(f"Subtitle found: {shape.text}")
+            # Subtitles are stored with the SECTION_HEADER label.
+            label = DocItemLabel.SECTION_HEADER
+        else:
+            return
+
+        doc.add_text(label=label, parent=parent_slide, text=txt, prov=prov)
+
+    def handle_pictures(self, shape, parent_slide, slide_ind, doc):
+        """Add a caption-less picture item for ``shape`` under ``parent_slide``."""
+        # Provenance is generated with an empty text for picture shapes.
+        picture_prov = self.generate_prov(shape, slide_ind, "")
+        doc.add_picture(parent=parent_slide, caption=None, prov=picture_prov)
+
+    def handle_tables(self, shape, parent_slide, slide_ind, doc):
+        """Convert the table held by ``shape`` into a Docling table item.
+
+        Cells whose stripped text is empty are skipped, and no table is added
+        when every cell is empty. Row and column spans are read from the
+        ``rowSpan`` / ``gridSpan`` attributes of each cell's ``a:tc`` element.
+
+        Args:
+            shape: the slide shape; ignored unless ``shape.has_table`` is true.
+            parent_slide: group item of the slide; the table is attached to it.
+            slide_ind: slide index, forwarded to ``generate_prov``.
+            doc: the document being built.
+        """
+        if not shape.has_table:
+            return
+
+        table = shape.table
+        # XML element of the shape that contains the table.
+        table_xml = shape._element
+        prov = self.generate_prov(shape, slide_ind, "")
+
+        num_rows = len(table.rows)
+        num_cols = 0
+        tcells = []
+
+        for row_idx, row in enumerate(table.rows):
+            num_cols = max(num_cols, len(row.cells))
+            for col_idx, cell in enumerate(row.cells):
+                # XPath positions are 1-based; this selects the cell's 'tc' element.
+                matches = table_xml.xpath(
+                    f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
+                )
+                if not matches:
+                    continue  # No cell XML found, skip this cell.
+                cell_xml = matches[0]
+
+                raw_row_span = cell_xml.get("rowSpan")  # Vertical span
+                raw_col_span = cell_xml.get("gridSpan")  # Horizontal span
+                row_span = 1 if raw_row_span is None else int(raw_row_span)
+                col_span = 1 if raw_col_span is None else int(raw_col_span)
+
+                cell_text = cell.text.strip()
+                if not cell_text:
+                    continue  # Empty cells are not exported.
+
+                tcells.append(
+                    TableCell(
+                        text=cell_text,
+                        row_span=row_span,
+                        col_span=col_span,
+                        start_row_offset_idx=row_idx,
+                        end_row_offset_idx=row_idx + row_span,
+                        start_col_offset_idx=col_idx,
+                        end_col_offset_idx=col_idx + col_span,
+                        col_header=False,
+                        row_header=False,
+                    )
+                )
+
+        if not tcells:
+            # A fully empty table is not added to the document.
+            return
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+        # Attach the table to its slide group, like pictures and text items.
+        doc.add_table(parent=parent_slide, data=data, prov=prov)
+
+    def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
+        """Walk all slides in order and add their content to ``doc``.
+
+        For every slide a CHAPTER group named ``slide-<index>`` and a page
+        (numbered from 1) are added. Each shape is then dispatched to the
+        table, picture and text handlers.
+
+        Args:
+            pptx_obj: the loaded presentation.
+            doc: the document being built.
+
+        Returns:
+            The same ``doc`` instance, populated.
+        """
+        # Units of size in PPTX by default are EMU units (English Metric Units)
+        slide_width = pptx_obj.slide_width
+        slide_height = pptx_obj.slide_height
+
+        # Loop through each slide; the enumerate index is the slide index.
+        for slide_ind, slide in enumerate(pptx_obj.slides):
+            parent_slide = doc.add_group(
+                name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=None
+            )
+
+            size = Size(width=slide_width, height=slide_height)
+            doc.add_page(page_no=slide_ind + 1, size=size)
+
+            # Loop through each shape in the slide
+            for shape in slide.shapes:
+
+                if shape.has_table:
+                    # Handle Tables
+                    self.handle_tables(shape, parent_slide, slide_ind, doc)
+
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    # Handle Pictures
+                    self.handle_pictures(shape, parent_slide, slide_ind, doc)
+
+                # If shape doesn't have any text, move on to the next shape
+                if not hasattr(shape, "text"):
+                    continue
+                if shape.text is None:
+                    continue
+                if len(shape.text.strip()) == 0:
+                    continue
+                if not shape.has_text_frame:
+                    _log.warning("Warning: shape has text but not text_frame")
+                    continue
+
+                # Handle text elements, including titles and lists
+                # (bullet lists, numbered lists)
+                self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+
+        return doc
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -0,0 +1,509 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+import docx
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+from lxml import etree
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class MsWordDocumentBackend(DeclarativeDocumentBackend):
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Open the DOCX source and initialise the hierarchy-tracking state.
+
+        Args:
+            in_doc: the input document descriptor, forwarded to the base class.
+            path_or_stream: the DOCX file, as a path or an in-memory stream.
+
+        Raises:
+            RuntimeError: if ``docx.Document`` fails to load the source.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # Qualified name of the ``val`` attribute read from numbering elements.
+        self.XML_KEY = (
+            "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
+        )
+        # NOTE(review): this maps "w" to the Word 2003 namespace while XML_KEY
+        # uses the 2006 main namespace - confirm which one the drawing/pict
+        # lookups in walk_linear are meant to match.
+        self.xml_namespaces = {
+            "w": "http://schemas.microsoft.com/office/word/2003/wordml"
+        }
+        # Word file:
+        self.path_or_stream = path_or_stream
+        self.valid = False
+        # Initialise the parents for the hierarchy (slot -1 is the root parent)
+        self.max_levels = 10
+        self.level_at_new_list = None
+        self.parents = {i: None for i in range(-1, self.max_levels)}  # type: ignore
+
+        self.level = 0
+        self.listIter = 0
+
+        # Each list starts with a None sentinel so that the prev_*() accessors
+        # return None before the first paragraph has been recorded.
+        self.history = {
+            "names": [None],
+            "levels": [None],
+            "numids": [None],
+            "indents": [None],
+        }
+
+        self.docx_obj = None
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.docx_obj = docx.Document(self.path_or_stream)
+            elif isinstance(self.path_or_stream, Path):
+                self.docx_obj = docx.Document(str(self.path_or_stream))
+
+            self.valid = True
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+
+    def is_valid(self) -> bool:
+        """Return the ``valid`` flag maintained by ``__init__``."""
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend does not support pagination (always False)."""
+        return False
+
+    def unload(self):
+        """Close the source if it is an in-memory stream and drop the reference."""
+        source = self.path_or_stream
+        if isinstance(source, BytesIO):
+            source.close()
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend (DOCX only)."""
+        return {InputFormat.DOCX}
+
+    def convert(self) -> DoclingDocument:
+        """Parse the DOCX into a structured document model.
+
+        The document is named after the file stem when the source is a path,
+        and "stream" otherwise.
+
+        Raises:
+            RuntimeError: if the backend is not valid (failed to initialise).
+        """
+        source = self.path_or_stream
+        fname = source.name if isinstance(source, Path) else ""
+
+        origin = DocumentOrigin(
+            filename=fname,
+            mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            binary_hash=self.document_hash,
+        )
+        docname = Path(fname).stem if len(fname) > 0 else "stream"
+        doc = DoclingDocument(name=docname, origin=origin)
+
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+
+        assert self.docx_obj is not None
+        return self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
+
+    def update_history(self, name, level, numid, ilevel):
+        """Record the style name, level, numbering id and indent of a paragraph."""
+        entries = (
+            ("names", name),
+            ("levels", level),
+            ("numids", numid),
+            ("indents", ilevel),
+        )
+        for key, value in entries:
+            self.history[key].append(value)
+
+    def prev_name(self):
+        """Return the most recently recorded style name (None initially)."""
+        return self.history["names"][-1]
+
+    def prev_level(self):
+        """Return the most recently recorded style level (None initially)."""
+        return self.history["levels"][-1]
+
+    def prev_numid(self):
+        """Return the most recently recorded numbering id (None initially)."""
+        return self.history["numids"][-1]
+
+    def prev_indent(self):
+        """Return the most recently recorded list indent level (None initially)."""
+        return self.history["indents"][-1]
+
+    def get_level(self) -> int:
+        """Return the first non-negative key of ``self.parents`` that is free.
+
+        Keys are visited in the dict's insertion order and a slot counts as
+        free when its value is None. Returns 0 when no non-negative slot is
+        free.
+        """
+        for k, v in self.parents.items():
+            # Identity check: a parent item must never compare equal to None.
+            if k >= 0 and v is None:
+                return k
+        return 0
+
+    def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
+        """Dispatch every child element of ``body`` to the matching handler.
+
+        Tables are tried first, then elements containing a drawing or pict
+        node, then paragraphs; anything else is ignored. A table that fails
+        to parse is skipped so that the rest of the document still converts.
+
+        Returns:
+            The same ``doc`` instance, populated.
+        """
+        for element in body:
+            tag_name = etree.QName(element).localname
+
+            # Check for Inline Images (drawings or blip elements).
+            # NOTE(review): the lookup uses self.xml_namespaces - confirm the
+            # "w" namespace there matches the elements found in DOCX bodies.
+            found_drawing = etree.ElementBase.xpath(
+                element, ".//w:drawing", namespaces=self.xml_namespaces
+            )
+            found_pict = etree.ElementBase.xpath(
+                element, ".//w:pict", namespaces=self.xml_namespaces
+            )
+
+            # Check for Tables
+            if element.tag.endswith("tbl"):
+                try:
+                    self.handle_tables(element, docx_obj, doc)
+                except Exception:
+                    # Best effort: keep converting, but keep the traceback
+                    # available in the debug log.
+                    _log.debug(
+                        "could not parse a table, broken docx table", exc_info=True
+                    )
+
+            elif found_drawing or found_pict:
+                self.handle_pictures(element, docx_obj, doc)
+            # Check for Text
+            elif tag_name in ["p"]:
+                self.handle_text_elements(element, docx_obj, doc)
+            else:
+                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
+        return doc
+
+    def str_to_int(self, s, default=0):
+        """Convert ``s`` to an int.
+
+        Returns None when ``s`` is None (regardless of ``default``) and
+        ``default`` when ``int(s)`` raises ValueError.
+        """
+        if s is not None:
+            try:
+                return int(s)
+            except ValueError:
+                return default
+        return None
+
+    def get_numId_and_ilvl(self, paragraph):
+        """Return ``(numId, ilvl)`` read from the paragraph's ``w:numPr`` element.
+
+        Both values are ints, or None when the element or its value is missing
+        or not numeric. ``(None, None)`` is returned when the paragraph has no
+        ``w:numPr`` element, i.e. it is not part of a list.
+        """
+        p_element = paragraph._element
+        numPr = p_element.find(".//w:numPr", namespaces=p_element.nsmap)
+        if numPr is None:
+            return None, None  # The paragraph is not part of a list
+
+        raw_values = []
+        for child_tag in ("w:numId", "w:ilvl"):
+            child = numPr.find(child_tag, namespaces=p_element.nsmap)
+            raw_values.append(child.get(self.XML_KEY) if child is not None else None)
+
+        raw_num_id, raw_ilvl = raw_values
+        num_id = self.str_to_int(raw_num_id, default=None)
+        ilvl = self.str_to_int(raw_ilvl, default=None)
+        return num_id, ilvl
+
+    def get_label_and_level(self, paragraph):
+        """Derive a ``(label, level)`` pair from the paragraph's style name.
+
+        * No style or no style name: ``("Normal", None)``.
+        * ``"<name>:<level>"``: ``(name, level)``; the level is None when the
+          part after the colon is not an integer.
+        * Two space-separated words containing "Heading" (e.g. "Heading 2"):
+          ``("Heading", level)``.
+        * Anything else: ``(style name, None)``.
+        """
+        if paragraph.style is None:
+            return "Normal", None
+        label = paragraph.style.name
+        if label is None:
+            return "Normal", None
+        if ":" in label:
+            parts = label.split(":")
+
+            if len(parts) == 2:
+                # Use the tolerant converter: a non-numeric suffix must not
+                # abort the conversion with a ValueError.
+                return parts[0], self.str_to_int(parts[1], default=None)
+
+        parts = label.split(" ")
+
+        if "Heading" in label and len(parts) == 2:
+            # After sorting, "Heading" may be on either side of the level.
+            parts.sort()
+            label_str = ""
+            label_level = 0
+            if parts[0] == "Heading":
+                label_str = parts[0]
+                label_level = self.str_to_int(parts[1], default=None)
+            if parts[1] == "Heading":
+                label_str = parts[1]
+                label_level = self.str_to_int(parts[0], default=None)
+            return label_str, label_level
+        else:
+            return label, None
+
+    def handle_text_elements(self, element, docx_obj, doc):
+        """Add a paragraph element to ``doc`` as list item, title, heading or text.
+
+        Paragraphs carrying a numbering id and indent level become list items.
+        Otherwise any open list is closed first, then the style name decides:
+        "Title" resets the hierarchy and adds a title, names containing
+        "Heading" go through ``add_header``, and every other style (default or
+        user-defined) is added as a plain paragraph. The paragraph is recorded
+        in the history in all cases.
+        """
+        paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
+        if paragraph.text is None:
+            return
+
+        # Empty paragraphs are kept on purpose: they separate adjacent lists.
+        text = paragraph.text.strip()
+
+        # Common list styles are "List Bullet", "List Number", "List Paragraph".
+        # TODO: reliably identify whether a list is a numbered list or not
+        is_numbered = False
+
+        p_style_name, p_level = self.get_label_and_level(paragraph)
+        numid, ilevel = self.get_numId_and_ilvl(paragraph)
+
+        # A numbering id of 0 is treated as "no numbering".
+        if numid == 0:
+            numid = None
+
+        # Handle lists
+        if numid is not None and ilevel is not None:
+            self.add_listitem(
+                element,
+                docx_obj,
+                doc,
+                p_style_name,
+                p_level,
+                numid,
+                ilevel,
+                text,
+                is_numbered,
+            )
+            self.update_history(p_style_name, p_level, numid, ilevel)
+            return
+
+        if numid is None and self.prev_numid() is not None:
+            # Close the list: free every slot from the list's level downwards.
+            list_level = self.level_at_new_list
+            for key in self.parents:
+                if key >= list_level:
+                    self.parents[key] = None
+            self.level = list_level - 1
+            self.level_at_new_list = None
+
+        if p_style_name == "Title":
+            # A title resets the whole hierarchy and becomes the new root.
+            for key in self.parents:
+                self.parents[key] = None
+            self.parents[0] = doc.add_text(
+                parent=None, label=DocItemLabel.TITLE, text=text
+            )
+        elif "Heading" in p_style_name:
+            self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
+        else:
+            # Known body styles ("Paragraph", "Normal", "Subtitle", "Author",
+            # "Default Text", "List Paragraph", "List Bullet", "Quote") and any
+            # user-defined style name are all treated as pure text.
+            level = self.get_level()
+            doc.add_text(
+                label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
+            )
+
+        self.update_history(p_style_name, p_level, numid, ilevel)
+
+    def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
+        """Add a heading and store it in the parent slot of its level.
+
+        With an integer ``curr_level`` the heading is stored in
+        ``self.parents[curr_level]``: missing intermediate levels are filled
+        with SECTION groups, and deeper slots are cleared when the heading
+        moves back up. Without an integer level, ``self.level`` is used.
+        """
+        level = self.get_level()
+
+        if not isinstance(curr_level, int):
+            self.parents[self.level] = doc.add_heading(
+                parent=self.parents[self.level - 1], text=text
+            )
+            return
+
+        if curr_level > level:
+            # Add invisible groups for the skipped levels.
+            for depth in range(level, curr_level):
+                self.parents[depth] = doc.add_group(
+                    parent=self.parents[depth - 1],
+                    label=GroupLabel.SECTION,
+                    name=f"header-{depth}",
+                )
+        elif curr_level < level:
+            # Remove the tail: this level and everything below it.
+            for key in self.parents:
+                if key >= curr_level:
+                    self.parents[key] = None
+
+        self.parents[curr_level] = doc.add_heading(
+            parent=self.parents[curr_level - 1], text=text
+        )
+
+    def add_listitem(
+        self,
+        element,
+        docx_obj,
+        doc,
+        p_style_name,
+        p_level,
+        numid,
+        ilevel,
+        text: str,
+        is_numbered=False,
+    ):
+        """Add a list item, opening or closing (nested) list groups as needed.
+
+        The previous paragraph's numbering id and indent (from the history)
+        decide the case: a new list, a deeper indent, a shallower indent, or a
+        sibling item. Numbered items get a "<n>." marker built from
+        ``self.listIter``; other items get an empty marker.
+        """
+
+        def _append_item(parent_key):
+            # Every branch bumps the counter before building the marker.
+            self.listIter += 1
+            marker = ""
+            enumerated = is_numbered
+            if is_numbered:
+                marker = str(self.listIter) + "."
+                enumerated = True
+            doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[parent_key],
+                text=text,
+            )
+
+        level = self.get_level()
+        prev_numid = self.prev_numid()
+
+        if prev_numid is None:  # Open new list
+            self.level_at_new_list = level  # type: ignore
+            self.parents[level] = doc.add_group(
+                label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
+            )
+            _append_item(level)
+
+        elif prev_numid == numid and self.prev_indent() < ilevel:
+            # Open indented list: one group per indent level that was skipped.
+            # TODO: determine if this is an unordered list or an ordered list.
+            first_depth = self.level_at_new_list + self.prev_indent() + 1
+            item_depth = self.level_at_new_list + ilevel
+            for depth in range(first_depth, item_depth + 1):
+                self.listIter = 0
+                group_label = (
+                    GroupLabel.ORDERED_LIST if is_numbered else GroupLabel.LIST
+                )
+                self.parents[depth] = doc.add_group(
+                    label=group_label, name="list", parent=self.parents[depth - 1]
+                )
+            _append_item(item_depth)
+
+        elif prev_numid == numid and ilevel < self.prev_indent():
+            # Close list: drop every group deeper than the item's own level.
+            item_depth = self.level_at_new_list + ilevel
+            for key in self.parents:
+                if key > item_depth:
+                    self.parents[key] = None
+            _append_item(item_depth)
+            self.listIter = 0
+
+        elif prev_numid == numid or self.prev_indent() == ilevel:
+            # Sibling item of the currently open list.
+            _append_item(level - 1)
+
+    def handle_tables(self, element, docx_obj, doc):
+        """Convert a ``w:tbl`` element into a Docling table under the current parent.
+
+        One TableCell is emitted per entry of ``row.cells``; the table is added
+        below ``self.parents[self.get_level() - 1]``.
+        """
+
+        # Function to check if a cell has a colspan (gridSpan)
+        # NOTE(review): gridSpan is looked up as an attribute of the cell
+        # element itself - confirm this XPath matches real documents.
+        def get_colspan(cell):
+            grid_span = cell._element.xpath("@w:gridSpan")
+            if grid_span:
+                return int(grid_span[0])  # Return the number of columns spanned
+            return 1  # Default is 1 (no colspan)
+
+        # Function to check if a cell has a rowspan (vMerge)
+        # NOTE(review): when the attribute is found this returns its raw string
+        # value, not an int, while the callers below do integer arithmetic with
+        # the result (``row_idx + row_span``), which raises TypeError for a
+        # string - confirm the intended span computation.
+        def get_rowspan(cell):
+            v_merge = cell._element.xpath("@w:vMerge")
+            if v_merge:
+                return v_merge[
+                    0
+                ]  # 'restart' indicates the beginning of a rowspan, others are continuation
+            return 1
+
+        table = docx.table.Table(element, docx_obj)
+
+        num_rows = len(table.rows)
+        num_cols = 0
+        for row in table.rows:
+            # Calculate the max number of columns
+            num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
+            # if row.cells:
+            #     num_cols = max(num_cols, len(row.cells))
+
+        # Initialize the table grid; it is only used to find the next free
+        # column of a row (None = free, "" = occupied).
+        table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+
+        for row_idx, row in enumerate(table.rows):
+            col_idx = 0
+            for c, cell in enumerate(row.cells):
+                row_span = get_rowspan(cell)
+                col_span = get_colspan(cell)
+
+                # Find the next available column in the grid
+                while table_grid[row_idx][col_idx] is not None:
+                    col_idx += 1
+
+                # Fill the grid with the cell value, considering rowspan and colspan
+                # NOTE(review): if row_span == "restart" this evaluates
+                # range("restart"), which raises TypeError - confirm intent.
+                for i in range(row_span if row_span == "restart" else 1):
+                    for j in range(col_span):
+                        table_grid[row_idx + i][col_idx + j] = ""
+
+                # From here on ``cell`` refers to the Docling TableCell, not
+                # the python-docx cell it was built from.
+                cell = TableCell(
+                    text=cell.text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=False,  # col_header,
+                    row_header=False,  # ((not col_header) and html_cell.name=='th')
+                )
+
+                data.table_cells.append(cell)
+
+        level = self.get_level()
+        doc.add_table(data=data, parent=self.parents[level - 1])
+        return
+
+    def handle_pictures(self, element, docx_obj, doc):
+        """Add a caption-less picture item under the current parent.
+
+        The parent is resolved the same way as for paragraphs and tables: the
+        item stored one slot above the first free level of ``self.parents``.
+        """
+        level = self.get_level()
+        doc.add_picture(parent=self.parents[level - 1], caption=None)
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -0,0 +1,78 @@
+from abc import ABC, abstractmethod
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, Optional, Set, Union
+
+from docling_core.types.doc import BoundingBox, Size
+from PIL import Image
+
+from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.datamodel.base_models import Cell, InputFormat
+from docling.datamodel.document import InputDocument
+
+
+class PdfPageBackend(ABC):
+    """Abstract interface every single-page PDF backend has to implement."""
+
+    @abstractmethod
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Return, as a string, the text associated with ``bbox``."""
+
+    @abstractmethod
+    def get_text_cells(self) -> Iterable[Cell]:
+        """Return an iterable of the page's text cells."""
+
+    # NOTE(review): the parameter is named ``float`` (shadowing the builtin)
+    # and annotated ``int``; presumably a scale factor - confirm before
+    # renaming, since keyword callers may rely on the current name.
+    @abstractmethod
+    def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
+        """Return an iterable of bounding boxes of the page's bitmaps."""
+
+    @abstractmethod
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        """Return the page as a PIL image, given ``scale`` and optional ``cropbox``."""
+
+    @abstractmethod
+    def get_size(self) -> Size:
+        """Return the page size."""
+
+    @abstractmethod
+    def is_valid(self) -> bool:
+        """Return whether the page backend is usable."""
+
+    @abstractmethod
+    def unload(self):
+        """Release the resources held by the page backend."""
+
+
+class PdfDocumentBackend(PaginatedDocumentBackend):
+    """Base class of the PDF backends; image inputs are converted to PDF first."""
+
+    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+        """Initialise the backend and normalise the source to PDF.
+
+        An IMAGE input is re-saved as a PDF into an in-memory buffer, which
+        then replaces ``self.path_or_stream``.
+
+        Raises:
+            RuntimeError: if the input format is neither PDF nor IMAGE.
+        """
+        super().__init__(in_doc, path_or_stream)
+
+        if self.input_format is InputFormat.PDF:
+            return
+
+        if self.input_format is not InputFormat.IMAGE:
+            raise RuntimeError(
+                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+            )
+
+        buf = BytesIO()
+        # The context manager releases the image's file handle deterministically.
+        with Image.open(self.path_or_stream) as img:
+            img.save(buf, "PDF")
+        buf.seek(0)
+        self.path_or_stream = buf
+
+    @abstractmethod
+    def load_page(self, page_no: int) -> PdfPageBackend:
+        """Return the page backend for page ``page_no``."""
+
+    @abstractmethod
+    def page_count(self) -> int:
+        """Return the number of pages of the document."""
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend (PDF only)."""
+        return {InputFormat.PDF}
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend supports pagination (always True)."""
+        return True
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -2,16 +2,20 @@ import logging
 import random
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Iterable, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
+from docling_core.types.doc import BoundingBox, CoordOrigin, Size
 from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage, PdfTextPage
+from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)

@@ -222,8 +226,8 @@ class PyPdfiumPageBackend(PdfPageBackend):

        return image

-    def get_size(self) -> PageSize:
-        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def get_size(self) -> Size:
+        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
@@ -231,13 +235,14 @@ class PyPdfiumPageBackend(PdfPageBackend):


 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
        try:
-            self._pdoc = pdfium.PdfDocument(path_or_stream)
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        except PdfiumError as e:
            raise RuntimeError(
-                f"pypdfium could not load document {document_hash}"
+                f"pypdfium could not load document with hash {self.document_hash}"
            ) from e

    def page_count(self) -> int:
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -5,22 +5,27 @@ import time
 import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional

 import typer
 from docling_core.utils.file import resolve_file_source

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    FormatToExtensions,
+    InputFormat,
+    OutputFormat,
+)
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
-    PipelineOptions,
+    OcrOptions,
+    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
-from docling.document_converter import DocumentConverter
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -87,28 +92,28 @@ def export_documents(
                fname = output_dir / f"{doc_filename}.json"
                with fname.open("w") as fp:
                    _log.info(f"writing JSON output to {fname}")
-                    fp.write(json.dumps(conv_res.render_as_dict()))
+                    fp.write(json.dumps(conv_res.document.export_to_dict()))

            # Export Text format:
            if export_txt:
                fname = output_dir / f"{doc_filename}.txt"
                with fname.open("w") as fp:
                    _log.info(f"writing Text output to {fname}")
-                    fp.write(conv_res.render_as_text())
+                    fp.write(conv_res.document.export_to_markdown(strict_text=True))

            # Export Markdown format:
            if export_md:
                fname = output_dir / f"{doc_filename}.md"
                with fname.open("w") as fp:
                    _log.info(f"writing Markdown output to {fname}")
-                    fp.write(conv_res.render_as_markdown())
+                    fp.write(conv_res.document.export_to_markdown())

            # Export Document Tags format:
            if export_doctags:
                fname = output_dir / f"{doc_filename}.doctags"
                with fname.open("w") as fp:
                    _log.info(f"writing Doc Tags output to {fname}")
-                    fp.write(conv_res.render_as_doctags())
+                    fp.write(conv_res.document.export_to_document_tokens())

        else:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -129,44 +134,31 @@ def convert(
            help="PDF files to convert. Can be local file / directory paths or URL.",
        ),
    ],
-    export_json: Annotated[
-        bool,
-        typer.Option(
-            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
-        ),
-    ] = False,
-    export_md: Annotated[
-        bool,
-        typer.Option(
-            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
-        ),
-    ] = True,
-    export_txt: Annotated[
-        bool,
-        typer.Option(
-            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
-        ),
-    ] = False,
-    export_doctags: Annotated[
-        bool,
-        typer.Option(
-            ...,
-            "--doctags/--no-doctags",
-            help="If enabled the document is exported as Doc Tags.",
-        ),
-    ] = False,
+    from_formats: List[InputFormat] = typer.Option(
+        None,
+        "--from",
+        help="Specify input formats to convert from. Defaults to all formats.",
+    ),
+    to_formats: List[OutputFormat] = typer.Option(
+        None, "--to", help="Specify output formats. Defaults to Markdown."
+    ),
    ocr: Annotated[
        bool,
        typer.Option(
            ..., help="If enabled, the bitmap content will be processed using OCR."
        ),
    ] = True,
-    backend: Annotated[
-        Backend, typer.Option(..., help="The PDF backend to use.")
-    ] = Backend.DOCLING,
    ocr_engine: Annotated[
        OcrEngine, typer.Option(..., help="The OCR engine to use.")
    ] = OcrEngine.EASYOCR,
+    abort_on_error: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--abort-on-error/--no-abort-on-error",
+            help="If enabled, the processing will be aborted when the first error is encountered.",
+        ),
+    ] = False,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
@@ -182,6 +174,9 @@ def convert(
 ):
    logging.basicConfig(level=logging.INFO)

+    if from_formats is None:
+        from_formats = [e for e in InputFormat]
+
    input_doc_paths: List[Path] = []
    for src in input_sources:
        source = resolve_file_source(source=src)
@@ -191,48 +186,54 @@ def convert(
            )
            raise typer.Abort()
        elif source.is_dir():
-            input_doc_paths.extend(list(source.glob("**/*.pdf")))
-            input_doc_paths.extend(list(source.glob("**/*.PDF")))
+            for fmt in from_formats:
+                for ext in FormatToExtensions[fmt]:
+                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
+                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
        else:
            input_doc_paths.append(source)

-    match backend:
-        case Backend.PYPDFIUM2:
-            do_cell_matching = ocr  # only do cell matching when OCR enabled
-            pdf_backend = PyPdfiumDocumentBackend
-        case Backend.DOCLING:
-            do_cell_matching = True
-            pdf_backend = DoclingParseDocumentBackend
-        case _:
-            raise RuntimeError(f"Unexpected backend type {backend}")
+    if to_formats is None:
+        to_formats = [OutputFormat.MARKDOWN]
+
+    export_json = OutputFormat.JSON in to_formats
+    export_md = OutputFormat.MARKDOWN in to_formats
+    export_txt = OutputFormat.TEXT in to_formats
+    export_doctags = OutputFormat.DOCTAGS in to_formats

    match ocr_engine:
        case OcrEngine.EASYOCR:
-            ocr_options = EasyOcrOptions()
+            ocr_options: OcrOptions = EasyOcrOptions()
        case OcrEngine.TESSERACT_CLI:
            ocr_options = TesseractCliOcrOptions()
        case OcrEngine.TESSERACT:
            ocr_options = TesseractOcrOptions()
        case _:
-            raise RuntimeError(f"Unexpected backend type {backend}")
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

-    pipeline_options = PipelineOptions(
+    pipeline_options = PdfPipelineOptions(
        do_ocr=ocr,
        ocr_options=ocr_options,
        do_table_structure=True,
    )
-    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
-    doc_converter = DocumentConverter(
-        pipeline_options=pipeline_options,
-        pdf_backend=pdf_backend,
-    )
+    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching

-    # Define input files
-    input = DocumentConversionInput.from_paths(input_doc_paths)
+    format_options: Dict[InputFormat, FormatOption] = {
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=DoclingParseDocumentBackend,  # pdf_backend
+        )
+    }
+    doc_converter = DocumentConverter(
+        allowed_formats=from_formats,
+        format_options=format_options,
+    )

    start_time = time.time()

-    conv_results = doc_converter.convert(input)
+    conv_results = doc_converter.convert_all(
+        input_doc_paths, raises_on_error=abort_on_error
+    )

    output.mkdir(parents=True, exist_ok=True)
    export_documents(
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,18 +1,19 @@
-import copy
-import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union

-from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, Field, model_validator
-from typing_extensions import Self
-
-from docling.backend.abstract_backend import PdfPageBackend
-from docling.datamodel.pipeline_options import (  # Must be imported here for backward compatibility.
-    PipelineOptions,
-    TableStructureOptions,
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItemLabel,
+    PictureDataType,
+    Size,
+    TableCell,
 )
+from PIL.Image import Image
+from pydantic import BaseModel, ConfigDict
+
+if TYPE_CHECKING:
+    from docling.backend.pdf_backend import PdfPageBackend


 class ConversionStatus(str, Enum):
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
    PARTIAL_SUCCESS = auto()


+class InputFormat(str, Enum):
+    DOCX = "docx"
+    PPTX = "pptx"
+    HTML = "html"
+    IMAGE = "image"
+    PDF = "pdf"
+
+
+class OutputFormat(str, Enum):
+    MARKDOWN = "md"
+    JSON = "json"
+    TEXT = "text"
+    DOCTAGS = "doctags"
+
+
+FormatToExtensions: Dict[InputFormat, List[str]] = {
+    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
+    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
+    InputFormat.PDF: ["pdf"],
+    InputFormat.HTML: ["html", "htm", "xhtml"],
+    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+}
+
+FormatToMimeType: Dict[InputFormat, Set[str]] = {
+    InputFormat.DOCX: {
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+    },
+    InputFormat.PPTX: {
+        "application/vnd.openxmlformats-officedocument.presentationml.template",
+        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    },
+    InputFormat.HTML: {"text/html", "application/xhtml+xml"},
+    InputFormat.IMAGE: {
+        "image/png",
+        "image/jpeg",
+        "image/tiff",
+        "image/gif",
+        "image/bmp",
+    },
+    InputFormat.PDF: {"application/pdf"},
+}
+MimeTypeToFormat = {
+    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
+}
+
+
 class DocInputType(str, Enum):
    PATH = auto()
    STREAM = auto()


-class CoordOrigin(str, Enum):
-    TOPLEFT = auto()
-    BOTTOMLEFT = auto()
-
-
 class DoclingComponentType(str, Enum):
-    PDF_BACKEND = auto()
+    DOCUMENT_BACKEND = auto()
    MODEL = auto()
    DOC_ASSEMBLER = auto()

@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
    error_message: str


-class PageSize(BaseModel):
-    width: float = 0.0
-    height: float = 0.0
-
-
-class BoundingBox(BaseModel):
-    l: float  # left
-    t: float  # top
-    r: float  # right
-    b: float  # bottom
-
-    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
-
-    @property
-    def width(self):
-        return self.r - self.l
-
-    @property
-    def height(self):
-        return abs(self.t - self.b)
-
-    def scaled(self, scale: float) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l *= scale
-        out_bbox.r *= scale
-        out_bbox.t *= scale
-        out_bbox.b *= scale
-
-        return out_bbox
-
-    def normalized(self, page_size: PageSize) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l /= page_size.width
-        out_bbox.r /= page_size.width
-        out_bbox.t /= page_size.height
-        out_bbox.b /= page_size.height
-
-        return out_bbox
-
-    def as_tuple(self):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return (self.l, self.t, self.r, self.b)
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return (self.l, self.b, self.r, self.t)
-
-    @classmethod
-    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
-        if origin == CoordOrigin.TOPLEFT:
-            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b < t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-        elif origin == CoordOrigin.BOTTOMLEFT:
-            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b > t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-
-    def area(self) -> float:
-        area = (self.r - self.l) * (self.b - self.t)
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            area = -area
-        return area
-
-    def intersection_area_with(self, other: "BoundingBox") -> float:
-        # Calculate intersection coordinates
-        left = max(self.l, other.l)
-        top = max(self.t, other.t)
-        right = min(self.r, other.r)
-        bottom = min(self.b, other.b)
-
-        # Calculate intersection dimensions
-        width = right - left
-        height = bottom - top
-
-        # If the bounding boxes do not overlap, width or height will be negative
-        if width <= 0 or height <= 0:
-            return 0.0
-
-        return width * height
-
-    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.TOPLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,
-                b=page_height - self.b,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            )
-
-    def to_top_left_origin(self, page_height):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,  # self.b
-                b=page_height - self.b,  # self.t
-                coord_origin=CoordOrigin.TOPLEFT,
-            )
-
-
 class Cell(BaseModel):
    id: int
    text: str
@@ -169,14 +101,14 @@ class OcrCell(Cell):

 class Cluster(BaseModel):
    id: int
-    label: str
+    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
    cells: List[Cell] = []


 class BasePageElement(BaseModel):
-    label: str
+    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
    clusters: List[Cluster] = []


-class TableCell(BaseModel):
-    bbox: BoundingBox
-    row_span: int
-    col_span: int
-    start_row_offset_idx: int
-    end_row_offset_idx: int
-    start_col_offset_idx: int
-    end_col_offset_idx: int
-    text: str
-    column_header: bool = False
-    row_header: bool = False
-    row_section: bool = False
-
-    @model_validator(mode="before")
-    @classmethod
-    def from_dict_format(cls, data: Any) -> Any:
-        if isinstance(data, Dict):
-            text = data["bbox"].get("token", "")
-            if not len(text):
-                text_cells = data.pop("text_cell_bboxes", None)
-                if text_cells:
-                    for el in text_cells:
-                        text += el["token"] + " "
-
-                text = text.strip()
-            data["text"] = text
-
-        return data
-
-
-class TableElement(BasePageElement):
+class Table(BasePageElement):
    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):


 class TableStructurePrediction(BaseModel):
-    table_map: Dict[int, TableElement] = {}
+    table_map: Dict[int, Table] = {}


-class TextElement(BasePageElement): ...
-
-
-class FigureData(BaseModel):
-    pass
+class TextElement(BasePageElement):
+    text: str


 class FigureElement(BasePageElement):
-    data: Optional[FigureData] = None
+    annotations: List[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
    equations_prediction: Optional[EquationPrediction] = None


-PageElement = Union[TextElement, TableElement, FigureElement]
+PageElement = Union[TextElement, Table, FigureElement]


 class AssembledUnit(BaseModel):
@@ -272,13 +171,13 @@ class Page(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
-    page_hash: Optional[str] = None
-    size: Optional[PageSize] = None
+    # page_hash: Optional[str] = None
+    size: Optional[Size] = None
    cells: List[Cell] = []
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

-    _backend: Optional[PdfPageBackend] = (
+    _backend: Optional["PdfPageBackend"] = (
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
@@ -301,24 +200,5 @@ class Page(BaseModel):
 class DocumentStream(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

-    filename: str
+    name: str
    stream: BytesIO
-
-
-class AssembleOptions(BaseModel):
-    keep_page_images: Annotated[
-        bool,
-        Field(
-            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
-        ),
-    ] = False  # False: page images are removed in the assemble step
-    images_scale: Optional[float] = None  # if set, the scale for generated images
-
-    @model_validator(mode="after")
-    def set_page_images_from_deprecated(self) -> Self:
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", DeprecationWarning)
-            default_scale = 1.0
-            if self.keep_page_images and self.images_scale is None:
-                self.images_scale = default_scale
-        return self
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,87 +1,101 @@
 import logging
+import re
+from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union

-from docling_core.types import BaseCell, BaseText
+import filetype
+from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
-from docling_core.types import TableCell
-from docling_core.types.doc.base import BoundingBox as DsBoundingBox
-from docling_core.types.doc.base import Figure
+from docling_core.types.doc import (
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    PictureItem,
+    SectionHeaderItem,
+    TableItem,
+    TextItem,
+)
+from docling_core.types.doc.document import ListItem
+from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
+from docling_core.utils.file import resolve_file_source
 from pydantic import BaseModel
 from typing_extensions import deprecated

-from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    PaginatedDocumentBackend,
+)
 from docling.datamodel.base_models import (
    AssembledUnit,
    ConversionStatus,
    DocumentStream,
    ErrorItem,
-    FigureElement,
+    InputFormat,
+    MimeTypeToFormat,
    Page,
-    PageElement,
-    TableElement,
-    TextElement,
 )
 from docling.datamodel.settings import DocumentLimits
-from docling.utils.utils import create_file_hash
+from docling.utils.utils import create_file_hash, create_hash
+
+if TYPE_CHECKING:
+    from docling.document_converter import FormatOption

 _log = logging.getLogger(__name__)

 layout_label_to_ds_type = {
-    "Title": "title",
-    "Document Index": "table-of-path_or_stream",
-    "Section-header": "subtitle-level-1",
-    "Checkbox-Selected": "checkbox-selected",
-    "Checkbox-Unselected": "checkbox-unselected",
-    "Caption": "caption",
-    "Page-header": "page-header",
-    "Page-footer": "page-footer",
-    "Footnote": "footnote",
-    "Table": "table",
-    "Formula": "equation",
-    "List-item": "paragraph",
-    "Code": "paragraph",
-    "Picture": "figure",
-    "Text": "paragraph",
+    DocItemLabel.TITLE: "title",
+    DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
+    DocItemLabel.SECTION_HEADER: "subtitle-level-1",
+    DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
+    DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
+    DocItemLabel.CAPTION: "caption",
+    DocItemLabel.PAGE_HEADER: "page-header",
+    DocItemLabel.PAGE_FOOTER: "page-footer",
+    DocItemLabel.FOOTNOTE: "footnote",
+    DocItemLabel.TABLE: "table",
+    DocItemLabel.FORMULA: "equation",
+    DocItemLabel.LIST_ITEM: "paragraph",
+    DocItemLabel.CODE: "paragraph",
+    DocItemLabel.PICTURE: "figure",
+    DocItemLabel.TEXT: "paragraph",
+    DocItemLabel.PARAGRAPH: "paragraph",
 }

-_EMPTY_DOC = DsDocument(
-    _name="",
-    description=DsDocumentDescription(logs=[]),
-    file_info=DsFileInfoObject(
-        filename="",
-        document_hash="",
-    ),
-)
+_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")


 class InputDocument(BaseModel):
-    file: PurePath = None
-    document_hash: Optional[str] = None
-    valid: bool = False
+    file: PurePath
+    document_hash: str  # = None
+    valid: bool = True
    limits: DocumentLimits = DocumentLimits()
+    format: InputFormat  # = None

    filesize: Optional[int] = None
-    page_count: Optional[int] = None
+    page_count: int = 0

-    _backend: PdfDocumentBackend = None  # Internal PDF backend used
+    _backend: AbstractDocumentBackend  # Internal document backend used

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
+        format: InputFormat,
+        backend: Type[AbstractDocumentBackend],
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
-        pdf_backend=DoclingParseDocumentBackend,
    ):
-        super().__init__()
+        super().__init__(
+            file="", document_hash="", format=InputFormat.PDF
+        )  # initialize with dummy values

        self.limits = limits or DocumentLimits()
+        self.format = format

        try:
            if isinstance(path_or_stream, Path):
@@ -91,11 +105,12 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(
-                        path_or_stream=path_or_stream, document_hash=self.document_hash
-                    )
+                    self._init_doc(backend, path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
+                assert (
+                    filename is not None
+                ), "Can't construct InputDocument from stream without providing filename arg."
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

@@ -103,15 +118,20 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(
-                        path_or_stream=path_or_stream, document_hash=self.document_hash
-                    )
+                    self._init_doc(backend, path_or_stream)
+            else:
+                raise RuntimeError(
+                    f"Unexpected type path_or_stream: {type(path_or_stream)}"
+                )

-            if self.document_hash and self._backend.page_count() > 0:
-                self.page_count = self._backend.page_count()
-
-                if self.page_count <= self.limits.max_num_pages:
-                    self.valid = True
+            # For paginated backends, check if the maximum page count is exceeded.
+            if self.valid and self._backend.is_valid():
+                if self._backend.supports_pagination() and isinstance(
+                    self._backend, PaginatedDocumentBackend
+                ):
+                    self.page_count = self._backend.page_count()
+                    if not self.page_count <= self.limits.max_num_pages:
+                        self.valid = False

        except (FileNotFoundError, OSError) as e:
            _log.exception(
@@ -125,9 +145,26 @@ class InputDocument(BaseModel):
            )
            # raise

+    def _init_doc(
+        self,
+        backend: Type[AbstractDocumentBackend],
+        path_or_stream: Union[BytesIO, Path],
+    ) -> None:
+        if backend is None:
+            raise RuntimeError(
+                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
+                f"Please check your format configuration on DocumentConverter."
+            )

-@deprecated("Use `ConversionResult` instead.")
-class ConvertedDocument(BaseModel):
+        self._backend = backend(self, path_or_stream=path_or_stream)
+
+
+class DocumentFormat(str, Enum):
+    V2 = "v2"
+    V1 = "v1"
+
+
+class ConversionResult(BaseModel):
    input: InputDocument

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
@@ -136,15 +173,42 @@ class ConvertedDocument(BaseModel):
    pages: List[Page] = []
    assembled: AssembledUnit = AssembledUnit()

-    output: DsDocument = _EMPTY_DOC
+    document: DoclingDocument = _EMPTY_DOCLING_DOC
+
+    @property
+    @deprecated("Use document instead.")
+    def legacy_document(self):
+        reverse_label_mapping = {
+            DocItemLabel.CAPTION.value: "Caption",
+            DocItemLabel.FOOTNOTE.value: "Footnote",
+            DocItemLabel.FORMULA.value: "Formula",
+            DocItemLabel.LIST_ITEM.value: "List-item",
+            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
+            DocItemLabel.PAGE_HEADER.value: "Page-header",
+            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
+            DocItemLabel.SECTION_HEADER.value: "Section-header",
+            DocItemLabel.TABLE.value: "Table",
+            DocItemLabel.TEXT.value: "Text",
+            DocItemLabel.TITLE.value: "Title",
+            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
+            DocItemLabel.CODE.value: "Code",
+            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
+            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
+            DocItemLabel.FORM.value: "Form",
+            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
+            DocItemLabel.PARAGRAPH.value: "paragraph",
+        }

-    def _to_ds_document(self) -> DsDocument:
        title = ""
        desc = DsDocumentDescription(logs=[])

        page_hashes = [
-            PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
-            for p in self.pages
+            PageReference(
+                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
+                page=p.page_no,
+                model="default",
+            )
+            for p in self.document.pages.values()
        ]

        file_info = DsFileInfoObject(
@@ -157,145 +221,199 @@ class ConvertedDocument(BaseModel):
        main_text = []
        tables = []
        figures = []
+        equations = []
+        footnotes = []
+        page_headers = []
+        page_footers = []

-        page_no_to_page = {p.page_no: p for p in self.pages}
+        embedded_captions = set()
+        for ix, (item, level) in enumerate(
+            self.document.iterate_items(self.document.body)
+        ):

-        for element in self.assembled.elements:
-            # Convert bboxes to lower-left origin.
-            target_bbox = DsBoundingBox(
-                element.cluster.bbox.to_bottom_left_origin(
-                    page_no_to_page[element.page_no].size.height
-                ).as_tuple()
-            )
+            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
+                caption = item.caption_text(self.document)
+                if caption:
+                    embedded_captions.add(caption)

-            if isinstance(element, TextElement):
-                main_text.append(
-                    BaseText(
-                        text=element.text,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        name=element.label,
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, len(element.text)],
-                            )
-                        ],
-                    )
-                )
-            elif isinstance(element, TableElement):
-                index = len(tables)
-                ref_str = f"#/tables/{index}"
-                main_text.append(
-                    Ref(
-                        name=element.label,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        ref=ref_str,
-                    ),
-                )
+        for item, level in self.document.iterate_items():
+            if isinstance(item, DocItem):
+                item_type = item.label

-                # Initialise empty table data grid (only empty cells)
-                table_data = [
-                    [
-                        TableCell(
-                            text="",
-                            # bbox=[0,0,0,0],
-                            spans=[[i, j]],
-                            obj_type="body",
+                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
+
+                    if isinstance(item, ListItem) and item.marker:
+                        text = f"{item.marker} {item.text}"
+                    else:
+                        text = item.text
+
+                    # Can be empty.
+                    prov = [
+                        Prov(
+                            bbox=p.bbox.as_tuple(),
+                            page=p.page_no,
+                            span=[0, len(item.text)],
                        )
-                        for j in range(element.num_cols)
+                        for p in item.prov
                    ]
-                    for i in range(element.num_rows)
-                ]
+                    main_text.append(
+                        BaseText(
+                            text=text,
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            name=reverse_label_mapping[item.label],
+                            prov=prov,
+                        )
+                    )

-                # Overwrite cells in table data for which there is actual cell content.
-                for cell in element.table_cells:
-                    for i in range(
-                        min(cell.start_row_offset_idx, element.num_rows),
-                        min(cell.end_row_offset_idx, element.num_rows),
-                    ):
-                        for j in range(
-                            min(cell.start_col_offset_idx, element.num_cols),
-                            min(cell.end_col_offset_idx, element.num_cols),
+                    # skip captions if they are embedded in the actual
+                    # floating object
+                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
+                        continue
+
+                elif isinstance(item, TableItem) and item.data:
+                    index = len(tables)
+                    ref_str = f"#/tables/{index}"
+                    main_text.append(
+                        Ref(
+                            name=reverse_label_mapping[item.label],
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            ref=ref_str,
+                        ),
+                    )
+
+                    # Initialise empty table data grid (only empty cells)
+                    table_data = [
+                        [
+                            TableCell(
+                                text="",
+                                # bbox=[0,0,0,0],
+                                spans=[[i, j]],
+                                obj_type="body",
+                            )
+                            for j in range(item.data.num_cols)
+                        ]
+                        for i in range(item.data.num_rows)
+                    ]
+
+                    # Overwrite cells in table data for which there is actual cell content.
+                    for cell in item.data.table_cells:
+                        for i in range(
+                            min(cell.start_row_offset_idx, item.data.num_rows),
+                            min(cell.end_row_offset_idx, item.data.num_rows),
                        ):
-                            celltype = "body"
-                            if cell.column_header:
-                                celltype = "col_header"
-                            elif cell.row_header:
-                                celltype = "row_header"
-                            elif cell.row_section:
-                                celltype = "row_section"
+                            for j in range(
+                                min(cell.start_col_offset_idx, item.data.num_cols),
+                                min(cell.end_col_offset_idx, item.data.num_cols),
+                            ):
+                                celltype = "body"
+                                if cell.column_header:
+                                    celltype = "col_header"
+                                elif cell.row_header:
+                                    celltype = "row_header"
+                                elif cell.row_section:
+                                    celltype = "row_section"

-                            def make_spans(cell):
-                                for rspan in range(
-                                    min(cell.start_row_offset_idx, element.num_rows),
-                                    min(cell.end_row_offset_idx, element.num_rows),
-                                ):
-                                    for cspan in range(
+                                def make_spans(cell):
+                                    for rspan in range(
                                        min(
-                                            cell.start_col_offset_idx, element.num_cols
+                                            cell.start_row_offset_idx,
+                                            item.data.num_rows,
+                                        ),
+                                        min(
+                                            cell.end_row_offset_idx, item.data.num_rows
                                        ),
-                                        min(cell.end_col_offset_idx, element.num_cols),
                                    ):
-                                        yield [rspan, cspan]
+                                        for cspan in range(
+                                            min(
+                                                cell.start_col_offset_idx,
+                                                item.data.num_cols,
+                                            ),
+                                            min(
+                                                cell.end_col_offset_idx,
+                                                item.data.num_cols,
+                                            ),
+                                        ):
+                                            yield [rspan, cspan]

-                            spans = list(make_spans(cell))
-                            table_data[i][j] = TableCell(
-                                text=cell.text,
-                                bbox=cell.bbox.to_bottom_left_origin(
-                                    page_no_to_page[element.page_no].size.height
-                                ).as_tuple(),
-                                # col=j,
-                                # row=i,
-                                spans=spans,
-                                obj_type=celltype,
-                                # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
-                                # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
-                            )
+                                spans = list(make_spans(cell))
+                                table_data[i][j] = GlmTableCell(
+                                    text=cell.text,
+                                    bbox=(
+                                        cell.bbox.as_tuple()
+                                        if cell.bbox is not None
+                                        else None
+                                    ),  # check if this is bottom-left
+                                    spans=spans,
+                                    obj_type=celltype,
+                                    col=j,
+                                    row=i,
+                                    row_header=cell.row_header,
+                                    row_section=cell.row_section,
+                                    col_header=cell.column_header,
+                                    row_span=[
+                                        cell.start_row_offset_idx,
+                                        cell.end_row_offset_idx,
+                                    ],
+                                    col_span=[
+                                        cell.start_col_offset_idx,
+                                        cell.end_col_offset_idx,
+                                    ],
+                                )

-                tables.append(
-                    DsSchemaTable(
-                        num_cols=element.num_cols,
-                        num_rows=element.num_rows,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        data=table_data,
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, 0],
-                            )
-                        ],
+                    # Compute the caption
+                    caption = item.caption_text(self.document)
+
+                    tables.append(
+                        DsSchemaTable(
+                            text=caption,
+                            num_cols=item.data.num_cols,
+                            num_rows=item.data.num_rows,
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            data=table_data,
+                            prov=[
+                                Prov(
+                                    bbox=p.bbox.as_tuple(),
+                                    page=p.page_no,
+                                    span=[0, 0],
+                                )
+                                for p in item.prov
+                            ],
+                        )
                    )
-                )

-            elif isinstance(element, FigureElement):
-                index = len(figures)
-                ref_str = f"#/figures/{index}"
-                main_text.append(
-                    Ref(
-                        name=element.label,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        ref=ref_str,
-                    ),
-                )
-                figures.append(
-                    Figure(
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, 0],
-                            )
-                        ],
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        # data=[[]],
+                elif isinstance(item, PictureItem):
+                    index = len(figures)
+                    ref_str = f"#/figures/{index}"
+                    main_text.append(
+                        Ref(
+                            name=reverse_label_mapping[item.label],
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            ref=ref_str,
+                        ),
+                    )
+
+                    # Compute the caption
+                    caption = item.caption_text(self.document)
+
+                    figures.append(
+                        Figure(
+                            prov=[
+                                Prov(
+                                    bbox=p.bbox.as_tuple(),
+                                    page=p.page_no,
+                                    span=[0, len(caption)],
+                                )
+                                for p in item.prov
+                            ],
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            text=caption,
+                            # data=[[]],
+                        )
                    )
-                )

        page_dimensions = [
-            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
-            for p in self.pages
+            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
+            for p in self.document.pages.values()
        ]

        ds_doc = DsDocument(
@@ -303,6 +421,10 @@ class ConvertedDocument(BaseModel):
            description=desc,
            file_info=file_info,
            main_text=main_text,
+            equations=equations,
+            footnotes=footnotes,
+            page_headers=page_headers,
+            page_footers=page_footers,
            tables=tables,
            figures=figures,
            page_dimensions=page_dimensions,
@@ -310,152 +432,76 @@ class ConvertedDocument(BaseModel):

        return ds_doc

-    def render_as_dict(self):
-        return self.output.model_dump(by_alias=True, exclude_none=True)

-    def render_as_markdown(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
-        strict_text: bool = False,
-        image_placeholder: str = "<!-- image -->",
-    ):
-        return self.output.export_to_markdown(
-            delim=delim,
-            main_text_start=main_text_start,
-            main_text_stop=main_text_stop,
-            main_text_labels=main_text_labels,
-            strict_text=strict_text,
-            image_placeholder=image_placeholder,
-        )
+class _DocumentConversionInput(BaseModel):

-    def render_as_text(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-        ],
-    ):
-        return self.output.export_to_markdown(
-            delim=delim,
-            main_text_start=main_text_start,
-            main_text_stop=main_text_stop,
-            main_text_labels=main_text_labels,
-            strict_text=True,
-        )
-
-    def render_as_doctags(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
-        xsize: int = 100,
-        ysize: int = 100,
-        add_location: bool = True,
-        add_content: bool = True,
-        add_page_index: bool = True,
-        # table specific flags
-        add_table_cell_location: bool = False,
-        add_table_cell_label: bool = True,
-        add_table_cell_text: bool = True,
-    ) -> str:
-        return self.output.export_to_document_tokens(
-            delim=delim,
-            main_text_start=main_text_start,
-            main_text_stop=main_text_stop,
-            main_text_labels=main_text_labels,
-            xsize=xsize,
-            ysize=ysize,
-            add_location=add_location,
-            add_content=add_content,
-            add_page_index=add_page_index,
-            # table specific flags
-            add_table_cell_location=add_table_cell_location,
-            add_table_cell_label=add_table_cell_label,
-            add_table_cell_text=add_table_cell_text,
-        )
-
-    def render_element_images(
-        self, element_types: Tuple[PageElement] = (FigureElement,)
-    ):
-        for element in self.assembled.elements:
-            if isinstance(element, element_types):
-                page_ix = element.page_no
-                scale = self.pages[page_ix]._default_image_scale
-                crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
-                    page_height=self.pages[page_ix].size.height * scale
-                )
-
-                cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
-                yield element, cropped_im
-
-
-class ConversionResult(ConvertedDocument):
-    pass
-
-
-class DocumentConversionInput(BaseModel):
-
-    _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
+    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
    limits: Optional[DocumentLimits] = DocumentLimits()

-    DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
-
    def docs(
-        self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
+        self, format_options: Dict[InputFormat, "FormatOption"]
    ) -> Iterable[InputDocument]:
+        for item in self.path_or_stream_iterator:
+            obj = resolve_file_source(item) if isinstance(item, str) else item
+            format = self._guess_format(obj)
+            if format not in format_options.keys():
+                _log.info(
+                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                )
+                continue
+            else:
+                backend = format_options[format].backend

-        pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
-
-        for obj in self._path_or_stream_iterator:
            if isinstance(obj, Path):
                yield InputDocument(
-                    path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
+                    path_or_stream=obj,
+                    format=format,
+                    filename=obj.name,
+                    limits=self.limits,
+                    backend=backend,
                )
            elif isinstance(obj, DocumentStream):
                yield InputDocument(
                    path_or_stream=obj.stream,
-                    filename=obj.filename,
+                    format=format,
+                    filename=obj.name,
                    limits=self.limits,
-                    pdf_backend=pdf_backend,
+                    backend=backend,
                )
+            else:
+                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

-    @classmethod
-    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
-        paths = [Path(p) for p in paths]
+    def _guess_format(self, obj):
+        content = None
+        if isinstance(obj, Path):
+            mime = filetype.guess_mime(str(obj))
+            if mime is None:
+                with obj.open("rb") as f:
+                    content = f.read(1024)  # Read first 1KB

-        doc_input = cls(limits=limits)
-        doc_input._path_or_stream_iterator = paths
+        elif isinstance(obj, DocumentStream):
+            obj.stream.seek(0)
+            content = obj.stream.read(8192)
+            obj.stream.seek(0)
+            mime = filetype.guess_mime(content)

-        return doc_input
+        if mime is None:
+            mime = self._detect_html_xhtml(content)

-    @classmethod
-    def from_streams(
-        cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
-    ):
-        doc_input = cls(limits=limits)
-        doc_input._path_or_stream_iterator = streams
+        format = MimeTypeToFormat.get(mime)
+        return format

-        return doc_input
+    def _detect_html_xhtml(self, content):
+        content_str = content.decode("ascii", errors="ignore").lower()
+        # Remove XML comments
+        content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
+        content_str = content_str.lstrip()
+
+        if re.match(r"<\?xml", content_str):
+            if "xhtml" in content_str[:1000]:
+                return "application/xhtml+xml"
+
+        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+            return "text/html"
+
+        return None
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,4 +1,5 @@
 from enum import Enum, auto
+from pathlib import Path
 from typing import List, Literal, Optional, Union

 from pydantic import BaseModel, ConfigDict, Field
@@ -58,6 +59,13 @@ class TesseractOcrOptions(OcrOptions):


 class PipelineOptions(BaseModel):
+    create_legacy_output: bool = (
+        True  # This default will be set to False in a future version of docling
+    )
+
+
+class PdfPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

@@ -65,3 +73,8 @@ class PipelineOptions(BaseModel):
    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
        Field(EasyOcrOptions(), discriminator="kind")
    )
+
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+    generate_table_images: bool = False
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@@ -14,6 +14,7 @@ class BatchConcurrencySettings(BaseModel):
    doc_batch_concurrency: int = 2
    page_batch_size: int = 4
    page_batch_concurrency: int = 2
+    elements_batch_size: int = 16

    # doc_batch_size: int = 1
    # doc_batch_concurrency: int = 1
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -1,84 +1,179 @@
-import functools
 import logging
-import tempfile
+import sys
 import time
-import traceback
+from functools import partial
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Type

-import requests
-from PIL import ImageDraw
-from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
+from pydantic import BaseModel, ConfigDict, model_validator, validate_call

-from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.datamodel.base_models import (
-    AssembledUnit,
-    AssembleOptions,
-    ConversionStatus,
-    DoclingComponentType,
-    ErrorItem,
-    Page,
-)
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
    ConversionResult,
-    DocumentConversionInput,
    InputDocument,
+    _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
-from docling.models.ds_glm_model import GlmModel
-from docling.models.page_assemble_model import PageAssembleModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-from docling.pipeline.standard_model_pipeline import StandardModelPipeline
-from docling.utils.utils import chunkify, create_hash
+from docling.datamodel.settings import DocumentLimits, settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)


+class FormatOption(BaseModel):
+    pipeline_cls: Type[BasePipeline]
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    @model_validator(mode="after")
+    def set_optional_field_default(self) -> "FormatOption":
+        if self.pipeline_options is None:
+            self.pipeline_options = self.pipeline_cls.get_default_options()
+        return self
+
+
+class WordFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
+
+
+class PowerpointFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
+
+
+class HTMLFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
+
+
+class PdfFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+
+
+class ImageFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+
+
+_format_to_default_options = {
+    InputFormat.DOCX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+    ),
+    InputFormat.PPTX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+    ),
+    InputFormat.HTML: FormatOption(
+        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+    ),
+    InputFormat.IMAGE: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+    InputFormat.PDF: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+}
+
+
 class DocumentConverter:
-    _default_download_filename = "file.pdf"
+    _default_download_filename = "file"

    def __init__(
        self,
-        artifacts_path: Optional[Union[Path, str]] = None,
-        pipeline_options: PipelineOptions = PipelineOptions(),
-        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-        assemble_options: AssembleOptions = AssembleOptions(),
+        allowed_formats: Optional[List[InputFormat]] = None,
+        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
    ):
-        if not artifacts_path:
-            artifacts_path = self.download_models_hf()
+        self.allowed_formats = allowed_formats
+        self.format_to_options = format_options

-        artifacts_path = Path(artifacts_path)
+        if self.allowed_formats is None:
+            # if self.format_to_options is not None:
+            #    self.allowed_formats = self.format_to_options.keys()
+            # else:
+            self.allowed_formats = [e for e in InputFormat]  # all formats

-        self.model_pipeline = pipeline_cls(
-            artifacts_path=artifacts_path, pipeline_options=pipeline_options
+        if self.format_to_options is None:
+            self.format_to_options = {f: o for f, o in _format_to_default_options.items() if f in self.allowed_formats}  # filtered copy, not the shared module dict
+        else:
+            for f in self.allowed_formats:
+                if f not in self.format_to_options.keys():
+                    _log.debug(f"Requested format {f} will use default options.")
+                    self.format_to_options[f] = _format_to_default_options[f]
+
+            remove_keys = []
+            for f in self.format_to_options.keys():
+                if f not in self.allowed_formats:
+                    remove_keys.append(f)
+
+            for f in remove_keys:
+                self.format_to_options.pop(f)
+
+        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+
+    @validate_call(config=ConfigDict(strict=True))
+    def convert(
+        self,
+        source: Path | str | DocumentStream,  # TODO review naming
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> ConversionResult:
+
+        all_res = self.convert_all(
+            source=[source],
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
        )
+        return next(all_res)

-        self.page_assemble_model = PageAssembleModel(config={})
-        self.glm_model = GlmModel(config={})
-        self.pdf_backend = pdf_backend
-        self.assemble_options = assemble_options
-
-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-
-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.0.0",
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_all(
+        self,
+        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> Iterator[ConversionResult]:
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source,
+            limits=limits,  # must match the model field name `limits`; a misspelled kwarg is not applied
+        )
+        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+        for conv_res in conv_res_iter:
+            if raises_on_error and conv_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise RuntimeError(
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
+                )
+            else:
+                yield conv_res

-        return Path(download_path)
-
-    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
+    def _convert(
+        self, conv_input: _DocumentConversionInput, raises_on_error: bool
+    ) -> Iterator[ConversionResult]:
+        assert self.format_to_options is not None

        for input_batch in chunkify(
-            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
+            conv_input.docs(self.format_to_options),
+            settings.perf.doc_batch_size,  # pass format_options
        ):
            _log.info(f"Going to convert document batch...")
            # parallel processing only within input_batch
@@ -87,211 +182,79 @@ class DocumentConverter:
            # ) as pool:
            #   yield from pool.map(self.process_document, input_batch)

-            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-            yield from map(self._process_document, input_batch)
+            # Note: PDF backends are not thread-safe, so thread pool usage is disabled.
+            for item in map(
+                partial(self._process_document, raises_on_error=raises_on_error),
+                input_batch,
+            ):
+                if item is not None:
+                    yield item

-    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
-        """Convert a single document.
+    def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
+        assert self.format_to_options is not None

-        Args:
-            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
+        fopt = self.format_to_options.get(doc.format)

-        Raises:
-            ValueError: If source is of unexpected type.
-            RuntimeError: If conversion fails.
+        if fopt is None:
+            raise RuntimeError(f"Could not get pipeline for document {doc.file}")
+        else:
+            pipeline_class = fopt.pipeline_cls
+            pipeline_options = fopt.pipeline_options

-        Returns:
-            ConversionResult: The conversion result object.
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            try:
-                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                res = requests.get(http_url, stream=True)
-                res.raise_for_status()
-                fname = None
-                # try to get filename from response header
-                if cont_disp := res.headers.get("Content-Disposition"):
-                    for par in cont_disp.strip().split(";"):
-                        # currently only handling directive "filename" (not "*filename")
-                        if (split := par.split("=")) and split[0].strip() == "filename":
-                            fname = "=".join(split[1:]).strip().strip("'\"") or None
-                            break
-                # otherwise, use name from URL:
-                if fname is None:
-                    fname = Path(http_url.path).name or self._default_download_filename
-                local_path = Path(temp_dir) / fname
-                with open(local_path, "wb") as f:
-                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                        f.write(chunk)
-            except ValidationError:
-                try:
-                    local_path = TypeAdapter(Path).validate_python(source)
-                except ValidationError:
-                    raise ValueError(
-                        f"Unexpected file path type encountered: {type(source)}"
-                    )
-            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert(conv_inp)
-            conv_res: ConversionResult = next(conv_res_iter)
-        if conv_res.status not in {
-            ConversionStatus.SUCCESS,
-            ConversionStatus.PARTIAL_SUCCESS,
-        }:
-            raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-        return conv_res
-
-    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-        start_doc_time = time.time()
-        conv_res = ConversionResult(input=in_doc)
-
-        _log.info(f"Processing document {in_doc.file.name}")
-
-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
-
-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))
-
-        all_assembled_pages = []
-
-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
-                # Pipeline
-
-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self._initialize_page, in_doc), page_batch
-                )
-
-                # 2. Populate page image
-                pages_with_images = map(
-                    functools.partial(self._populate_page_images, in_doc), init_pages
-                )
-
-                # 3. Populate programmatic page cells
-                pages_with_cells = map(
-                    functools.partial(self._parse_page_cells, in_doc),
-                    pages_with_images,
-                )
-
-                # 4. Run pipeline stages
-                pipeline_pages = self.model_pipeline.apply(pages_with_cells)
-
-                # 5. Assemble page elements (per page)
-                assembled_pages = self.page_assemble_model(pipeline_pages)
-
-                # exhaust assembled_pages
-                for assembled_page in assembled_pages:
-                    # Free up mem resources before moving on with next batch
-
-                    # Remove page images (can be disabled)
-                    if self.assemble_options.images_scale is None:
-                        assembled_page._image_cache = {}
-
-                    # Unload backend
-                    assembled_page._backend.unload()
-
-                    all_assembled_pages.append(assembled_page)
-
-                end_pb_time = time.time() - start_pb_time
-                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
-
-            conv_res.pages = all_assembled_pages
-            self._assemble_doc(conv_res)
-
-            status = ConversionStatus.SUCCESS
-            for page in conv_res.pages:
-                if not page._backend.is_valid():
-                    conv_res.errors.append(
-                        ErrorItem(
-                            component_type=DoclingComponentType.PDF_BACKEND,
-                            module_name=type(page._backend).__name__,
-                            error_message=f"Page {page.page_no} failed to parse.",
-                        )
-                    )
-                    status = ConversionStatus.PARTIAL_SUCCESS
-
-            conv_res.status = status
-
-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.info(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
+        assert pipeline_options is not None
+        # TODO: pipelines are cached per pipeline class only; different options for the same class replace the cached instance.
+        if (
+            pipeline_class not in self.initialized_pipelines
+            or self.initialized_pipelines[pipeline_class].pipeline_options
+            != pipeline_options
+        ):
+            self.initialized_pipelines[pipeline_class] = pipeline_class(
+                pipeline_options=pipeline_options
            )
+        return self.initialized_pipelines[pipeline_class]

-        finally:
-            # Always unload the PDF backend, even in case of failure
-            if in_doc._backend:
-                in_doc._backend.unload()
+    def _process_document(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> Optional[ConversionResult]:
+        assert self.allowed_formats is not None
+        assert in_doc.format in self.allowed_formats
+
+        start_doc_time = time.time()
+
+        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

        end_doc_time = time.time() - start_doc_time
        _log.info(
-            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
+            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
        )

        return conv_res

-    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)
-        page.size = page._backend.get_size()
-        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
+    def _execute_pipeline(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> ConversionResult:
+        if in_doc.valid:
+            pipeline = self._get_pipeline(in_doc)
+            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                if raises_on_error:
+                    raise RuntimeError(
+                        f"No pipeline could be initialized for {in_doc.file}."
+                    )
+                else:
+                    conv_res = ConversionResult(input=in_doc)
+                    conv_res.status = ConversionStatus.FAILURE
+                    return conv_res

-        return page
+            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)

-    # Generate the page image and store it in the page object
-    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        # default scale
-        page.get_image(
-            scale=1.0
-        )  # puts the page image on the image cache at default scale
+        else:
+            if raises_on_error:
+                raise RuntimeError(f"Input document {in_doc.file} is not valid.")

-        # user requested scales
-        if self.assemble_options.images_scale is not None:
-            page._default_image_scale = self.assemble_options.images_scale
-            page.get_image(
-                scale=self.assemble_options.images_scale
-            )  # this will trigger storing the image in the internal cache
+            else:
+                # invalid doc or not of desired format
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                # TODO: add an error log entry explaining why it failed.

-        return page
-
-    # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-        page.cells = page._backend.get_text_cells()
-
-        # DEBUG code:
-        def draw_text_boxes(image, cells):
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
-
-        # draw_text_boxes(page.get_image(scale=1.0), cells)
-
-        return page
-
-    def _assemble_doc(self, conv_res: ConversionResult):
-        all_elements = []
-        all_headers = []
-        all_body = []
-
-        for p in conv_res.pages:
-
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
-
-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )
-
-        conv_res.output = self.glm_model(conv_res)
+        return conv_res
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import Any, Iterable
+
+from docling_core.types.doc import DoclingDocument, NodeItem
+
+from docling.datamodel.base_models import Page
+
+
+class BasePageModel(ABC):
+    @abstractmethod
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        pass
+
+
+class BaseEnrichmentModel(ABC):
+
+    @abstractmethod
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        pass
+
+    @abstractmethod
+    def __call__(
+        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
+    ) -> Iterable[Any]:
+        pass
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -1,14 +1,15 @@
 import copy
 import logging
 from abc import abstractmethod
-from typing import Iterable, List, Tuple
+from typing import Iterable, List

 import numpy as np
+from docling_core.types.doc import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label

-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import OcrOptions

 _log = logging.getLogger(__name__)
@@ -20,8 +21,9 @@ class BaseOcrModel:
        self.options = options

    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
-    def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
+    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
        BITMAP_COVERAGE_TRESHOLD = 0.75
+        assert page.size is not None

        def find_ocr_rects(size, bitmap_rects):
            image = Image.new(
@@ -60,7 +62,10 @@ class BaseOcrModel:

            return (area_frac, bounding_boxes)  # fraction covered  # boxes

-        bitmap_rects = page._backend.get_bitmap_rects()
+        if page._backend is not None:
+            bitmap_rects = page._backend.get_bitmap_rects()
+        else:
+            bitmap_rects = []
        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

        # return full-page rectangle if sufficiently covered with bitmaps
@@ -75,7 +80,7 @@ class BaseOcrModel:
                )
            ]
        # return individual rectangles if the bitmap coverage is smaller
-        elif coverage < BITMAP_COVERAGE_TRESHOLD:
+        else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
            return ocr_rects

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@@ -1,39 +1,228 @@
 import copy
 import random
+from typing import List, Union

 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import to_legacy_document_format
+from deepsearch_glm.utils.doc_utils import to_docling_document
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
-from docling_core.types import Ref
+from docling_core.types import DocumentDescription as DsDocumentDescription
+from docling_core.types import FileInfoObject as DsFileInfoObject
+from docling_core.types import PageDimensions, PageReference, Prov, Ref
+from docling_core.types import Table as DsSchemaTable
+from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
+from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
+from docling_core.types.legacy_doc.base import Figure, TableCell
 from PIL import ImageDraw
+from pydantic import BaseModel, ConfigDict

-from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
-from docling.datamodel.document import ConversionResult
+from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
+from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
+from docling.utils.utils import create_hash
+
+
+class GlmOptions(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
+    model_names: str = ""  # e.g. "language;term;reference"


 class GlmModel:
-    def __init__(self, config):
-        self.config = config
-        self.model_names = self.config.get(
-            "model_names", ""
-        )  # "language;term;reference"
-        load_pretrained_nlp_models()
-        # model = init_nlp_model(model_names="language;term;reference")
-        model = init_nlp_model(model_names=self.model_names)
-        self.model = model
+    def __init__(self, options: GlmOptions):
+        self.options = options

-    def __call__(self, conv_res: ConversionResult) -> DsDocument:
-        ds_doc = conv_res._to_ds_document()
+        load_pretrained_nlp_models()
+        self.model = init_nlp_model(model_names=self.options.model_names)
+
+    def _to_legacy_document(self, conv_res) -> DsDocument:
+        title = ""
+        desc: DsDocumentDescription = DsDocumentDescription(logs=[])
+
+        page_hashes = [
+            PageReference(
+                hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
+                page=p.page_no + 1,
+                model="default",
+            )
+            for p in conv_res.pages
+        ]
+
+        file_info = DsFileInfoObject(
+            filename=conv_res.input.file.name,
+            document_hash=conv_res.input.document_hash,
+            num_pages=conv_res.input.page_count,
+            page_hashes=page_hashes,
+        )
+
+        main_text: List[Union[Ref, BaseText]] = []
+        tables: List[DsSchemaTable] = []
+        figures: List[Figure] = []
+
+        page_no_to_page = {p.page_no: p for p in conv_res.pages}
+
+        for element in conv_res.assembled.elements:
+            # Convert bboxes to lower-left origin.
+            target_bbox = DsBoundingBox(
+                element.cluster.bbox.to_bottom_left_origin(
+                    page_no_to_page[element.page_no].size.height
+                ).as_tuple()
+            )
+
+            if isinstance(element, TextElement):
+                main_text.append(
+                    BaseText(
+                        text=element.text,
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        name=element.label,
+                        prov=[
+                            Prov(
+                                bbox=target_bbox,
+                                page=element.page_no + 1,
+                                span=[0, len(element.text)],
+                            )
+                        ],
+                    )
+                )
+            elif isinstance(element, Table):
+                index = len(tables)
+                ref_str = f"#/tables/{index}"
+                main_text.append(
+                    Ref(
+                        name=element.label,
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        ref=ref_str,
+                    ),
+                )
+
+                # Initialise empty table data grid (only empty cells)
+                table_data = [
+                    [
+                        TableCell(
+                            text="",
+                            # bbox=[0,0,0,0],
+                            spans=[[i, j]],
+                            obj_type="body",
+                        )
+                        for j in range(element.num_cols)
+                    ]
+                    for i in range(element.num_rows)
+                ]
+
+                # Overwrite cells in table data for which there is actual cell content.
+                for cell in element.table_cells:
+                    for i in range(
+                        min(cell.start_row_offset_idx, element.num_rows),
+                        min(cell.end_row_offset_idx, element.num_rows),
+                    ):
+                        for j in range(
+                            min(cell.start_col_offset_idx, element.num_cols),
+                            min(cell.end_col_offset_idx, element.num_cols),
+                        ):
+                            celltype = "body"
+                            if cell.column_header:
+                                celltype = "col_header"
+                            elif cell.row_header:
+                                celltype = "row_header"
+                            elif cell.row_section:
+                                celltype = "row_section"
+
+                            def make_spans(cell):
+                                for rspan in range(
+                                    min(cell.start_row_offset_idx, element.num_rows),
+                                    min(cell.end_row_offset_idx, element.num_rows),
+                                ):
+                                    for cspan in range(
+                                        min(
+                                            cell.start_col_offset_idx, element.num_cols
+                                        ),
+                                        min(cell.end_col_offset_idx, element.num_cols),
+                                    ):
+                                        yield [rspan, cspan]
+
+                            spans = list(make_spans(cell))
+                            if cell.bbox is not None:
+                                bbox = cell.bbox.to_bottom_left_origin(
+                                    page_no_to_page[element.page_no].size.height
+                                ).as_tuple()
+                            else:
+                                bbox = None
+
+                            table_data[i][j] = TableCell(
+                                text=cell.text,
+                                bbox=bbox,
+                                # col=j,
+                                # row=i,
+                                spans=spans,
+                                obj_type=celltype,
+                                # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
+                                # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
+                            )
+
+                tables.append(
+                    DsSchemaTable(
+                        num_cols=element.num_cols,
+                        num_rows=element.num_rows,
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        data=table_data,
+                        prov=[
+                            Prov(
+                                bbox=target_bbox,
+                                page=element.page_no + 1,
+                                span=[0, 0],
+                            )
+                        ],
+                    )
+                )
+
+            elif isinstance(element, FigureElement):
+                index = len(figures)
+                ref_str = f"#/figures/{index}"
+                main_text.append(
+                    Ref(
+                        name=element.label,
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        ref=ref_str,
+                    ),
+                )
+                figures.append(
+                    Figure(
+                        prov=[
+                            Prov(
+                                bbox=target_bbox,
+                                page=element.page_no + 1,
+                                span=[0, 0],
+                            )
+                        ],
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        # data=[[]],
+                    )
+                )
+
+        page_dimensions = [
+            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
+            for p in conv_res.pages
+        ]
+
+        ds_doc: DsDocument = DsDocument(
+            name=title,
+            description=desc,
+            file_info=file_info,
+            main_text=main_text,
+            tables=tables,
+            figures=figures,
+            page_dimensions=page_dimensions,
+        )
+
+        return ds_doc
+
+    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
+        ds_doc = self._to_legacy_document(conv_res)
        ds_doc_dict = ds_doc.model_dump(by_alias=True)

        glm_doc = self.model.apply_on_doc(ds_doc_dict)
-        ds_doc_dict = to_legacy_document_format(
-            glm_doc, ds_doc_dict, update_name_label=True
-        )

-        exported_doc = DsDocument.model_validate(ds_doc_dict)
+        docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental

        # DEBUG code:
        def draw_clusters_and_cells(ds_document, page_no):
@@ -48,7 +237,7 @@ class GlmModel:
                    if arr == "tables":
                        prov = ds_document.tables[index].prov[0]
                    elif arr == "figures":
-                        prov = ds_document.figures[index].prov[0]
+                        prov = ds_document.pictures[index].prov[0]
                    else:
                        prov = None

@@ -83,4 +272,4 @@ class GlmModel:
        # draw_clusters_and_cells(ds_doc, 0)
        # draw_clusters_and_cells(exported_doc, 0)

-        return exported_doc
+        return docling_doc
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -2,8 +2,9 @@ import logging
 from typing import Iterable

 import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin

-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel

@@ -39,6 +40,8 @@ class EasyOcrModel(BaseOcrModel):
            return

        for page in page_batch:
+            assert page._backend is not None
+
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -2,8 +2,10 @@ import copy
 import logging
 import random
 import time
+from pathlib import Path
 from typing import Iterable, List

+from docling_core.types.doc import CoordOrigin, DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import ImageDraw

@@ -11,74 +13,73 @@ from docling.datamodel.base_models import (
    BoundingBox,
    Cell,
    Cluster,
-    CoordOrigin,
    LayoutPrediction,
    Page,
 )
+from docling.models.base_model import BasePageModel
 from docling.utils import layout_utils as lu

 _log = logging.getLogger(__name__)


-class LayoutModel:
+class LayoutModel(BasePageModel):

    TEXT_ELEM_LABELS = [
-        "Text",
-        "Footnote",
-        "Caption",
-        "Checkbox-Unselected",
-        "Checkbox-Selected",
-        "Section-header",
-        "Page-header",
-        "Page-footer",
-        "Code",
-        "List-item",
-        # "Title"
+        DocItemLabel.TEXT,
+        DocItemLabel.FOOTNOTE,
+        DocItemLabel.CAPTION,
+        DocItemLabel.CHECKBOX_UNSELECTED,
+        DocItemLabel.CHECKBOX_SELECTED,
+        DocItemLabel.SECTION_HEADER,
+        DocItemLabel.PAGE_HEADER,
+        DocItemLabel.PAGE_FOOTER,
+        DocItemLabel.CODE,
+        DocItemLabel.LIST_ITEM,
        # "Formula",
    ]
-    PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
+    PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]

-    TABLE_LABEL = "Table"
-    FIGURE_LABEL = "Picture"
-    FORMULA_LABEL = "Formula"
+    TABLE_LABEL = DocItemLabel.TABLE
+    FIGURE_LABEL = DocItemLabel.PICTURE
+    FORMULA_LABEL = DocItemLabel.FORMULA

-    def __init__(self, config):
-        self.config = config
-        self.layout_predictor = LayoutPredictor(
-            config["artifacts_path"]
-        )  # TODO temporary
+    def __init__(self, artifacts_path: Path):
+        self.layout_predictor = LayoutPredictor(artifacts_path)  # TODO temporary

-    def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
+    def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
        MIN_INTERSECTION = 0.2
        CLASS_THRESHOLDS = {
-            "Caption": 0.35,
-            "Footnote": 0.35,
-            "Formula": 0.35,
-            "List-item": 0.35,
-            "Page-footer": 0.35,
-            "Page-header": 0.35,
-            "Picture": 0.2,  # low threshold adjust to capture chemical structures for examples.
-            "Section-header": 0.45,
-            "Table": 0.35,
-            "Text": 0.45,
-            "Title": 0.45,
-            "Document Index": 0.45,
-            "Code": 0.45,
-            "Checkbox-Selected": 0.45,
-            "Checkbox-Unselected": 0.45,
-            "Form": 0.45,
-            "Key-Value Region": 0.45,
+            DocItemLabel.CAPTION: 0.35,
+            DocItemLabel.FOOTNOTE: 0.35,
+            DocItemLabel.FORMULA: 0.35,
+            DocItemLabel.LIST_ITEM: 0.35,
+            DocItemLabel.PAGE_FOOTER: 0.35,
+            DocItemLabel.PAGE_HEADER: 0.35,
+            DocItemLabel.PICTURE: 0.2,  # low threshold, adjusted to capture e.g. chemical structures.
+            DocItemLabel.SECTION_HEADER: 0.45,
+            DocItemLabel.TABLE: 0.35,
+            DocItemLabel.TEXT: 0.45,
+            DocItemLabel.TITLE: 0.45,
+            DocItemLabel.DOCUMENT_INDEX: 0.45,
+            DocItemLabel.CODE: 0.45,
+            DocItemLabel.CHECKBOX_SELECTED: 0.45,
+            DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
+            DocItemLabel.FORM: 0.45,
+            DocItemLabel.KEY_VALUE_REGION: 0.45,
        }

-        CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
+        CLASS_REMAPPINGS = {
+            DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
+            DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
+        }

        _log.debug("================= Start postprocess function ====================")
        start_time = time.time()
        # Apply Confidence Threshold to cluster predictions
        # confidence = self.conf_threshold
-        clusters_out = []
+        clusters_mod = []

-        for cluster in clusters:
+        for cluster in clusters_in:
            confidence = CLASS_THRESHOLDS[cluster.label]
            if cluster.confidence >= confidence:
                # annotation["created_by"] = "high_conf_pred"
@@ -86,10 +87,10 @@ class LayoutModel:
                # Remap class labels where needed.
                if cluster.label in CLASS_REMAPPINGS.keys():
                    cluster.label = CLASS_REMAPPINGS[cluster.label]
-                clusters_out.append(cluster)
+                clusters_mod.append(cluster)

        # map to dictionary clusters and cells, with bottom left origin
-        clusters = [
+        clusters_orig = [
            {
                "id": c.id,
                "bbox": list(
@@ -99,7 +100,7 @@ class LayoutModel:
                "cell_ids": [],
                "type": c.label,
            }
-            for c in clusters
+            for c in clusters_in
        ]

        clusters_out = [
@@ -113,9 +114,11 @@ class LayoutModel:
                "cell_ids": [],
                "type": c.label,
            }
-            for c in clusters_out
+            for c in clusters_mod
        ]

+        del clusters_mod
+
        raw_cells = [
            {
                "id": c.id,
@@ -149,7 +152,7 @@ class LayoutModel:

        # Assign orphan cells with lower confidence predictions
        clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
-            clusters_out, clusters, raw_cells, orphan_cell_indices
+            clusters_out, clusters_orig, raw_cells, orphan_cell_indices
        )

        # Refresh the cell_ids assignment, after creating new clusters using low conf predictions
@@ -178,7 +181,7 @@ class LayoutModel:
        ) = lu.cell_id_state_map(clusters_out, cell_count)

        clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
-            clusters_out, clusters, raw_cells, orphan_cell_indices
+            clusters_out, clusters_orig, raw_cells, orphan_cell_indices
        )

        _log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
@@ -237,46 +240,55 @@ class LayoutModel:
        end_time = time.time() - start_time
        _log.debug(f"Finished post processing in seconds={end_time:.3f}")

-        cells_out = [
+        cells_out_new = [
            Cell(
-                id=c["id"],
+                id=c["id"],  # type: ignore
                bbox=BoundingBox.from_tuple(
-                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT  # type: ignore
                ).to_top_left_origin(page_height),
-                text=c["text"],
+                text=c["text"],  # type: ignore
            )
            for c in cells_out
        ]
+
+        del cells_out
+
        clusters_out_new = []
        for c in clusters_out:
-            cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
+            cluster_cells = [
+                ccell for ccell in cells_out_new if ccell.id in c["cell_ids"]  # type: ignore
+            ]
            c_new = Cluster(
-                id=c["id"],
+                id=c["id"],  # type: ignore
                bbox=BoundingBox.from_tuple(
-                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT  # type: ignore
                ).to_top_left_origin(page_height),
-                confidence=c["confidence"],
-                label=c["type"],
+                confidence=c["confidence"],  # type: ignore
+                label=DocItemLabel(c["type"]),
                cells=cluster_cells,
            )
            clusters_out_new.append(c_new)

-        return clusters_out_new, cells_out
+        return clusters_out_new, cells_out_new

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
+            assert page.size is not None
+
            clusters = []
            for ix, pred_item in enumerate(
                self.layout_predictor.predict(page.get_image(scale=1.0))
            ):
+                label = DocItemLabel(
+                    pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                )  # Temporary, until docling-ibm-models uses docling-core types
                cluster = Cluster(
                    id=ix,
-                    label=pred_item["label"],
+                    label=label,
                    confidence=pred_item["confidence"],
                    bbox=BoundingBox.model_validate(pred_item),
                    cells=[],
                )
-
                clusters.append(cluster)

            # Map cells to clusters
--- a/docling/models/page_assemble_model.py
+++ b/docling/models/page_assemble_model.py
@@ -2,22 +2,29 @@ import logging
 import re
 from typing import Iterable, List

+from pydantic import BaseModel
+
 from docling.datamodel.base_models import (
    AssembledUnit,
    FigureElement,
    Page,
    PageElement,
-    TableElement,
+    Table,
    TextElement,
 )
+from docling.models.base_model import BasePageModel
 from docling.models.layout_model import LayoutModel

 _log = logging.getLogger(__name__)


-class PageAssembleModel:
-    def __init__(self, config):
-        self.config = config
+class PageAssembleOptions(BaseModel):
+    keep_images: bool = False
+
+
+class PageAssembleModel(BasePageModel):
+    def __init__(self, options: PageAssembleOptions):
+        self.options = options

    def sanitize_text(self, lines):
        if len(lines) <= 1:
@@ -46,6 +53,8 @@ class PageAssembleModel:

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
+            assert page._backend is not None
+            assert page.predictions.layout is not None
            # assembles some JSON output page by page.

            elements: List[PageElement] = []
@@ -84,7 +93,7 @@ class PageAssembleModel:
                    if (
                        not tbl
                    ):  # fallback: add table without structure, if it isn't present
-                        tbl = TableElement(
+                        tbl = Table(
                            label=cluster.label,
                            id=cluster.id,
                            text="",
@@ -145,4 +154,11 @@ class PageAssembleModel:
                elements=elements, headers=headers, body=body
            )

+            # Remove page images (can be disabled)
+            if not self.options.keep_images:
+                page._image_cache = {}
+
+            # Unload backend
+            page._backend.unload()
+
            yield page
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -0,0 +1,57 @@
+from typing import Iterable, Optional
+
+from PIL import ImageDraw
+from pydantic import BaseModel
+
+from docling.datamodel.base_models import Page
+from docling.models.base_model import BasePageModel
+
+
+class PagePreprocessingOptions(BaseModel):
+    images_scale: Optional[float]
+
+
+class PagePreprocessingModel(BasePageModel):
+    def __init__(self, options: PagePreprocessingOptions):
+        self.options = options
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        for page in page_batch:
+            page = self._populate_page_images(page)
+            page = self._parse_page_cells(page)
+            yield page
+
+    # Generate the page image and store it in the page object
+    def _populate_page_images(self, page: Page) -> Page:
+        # default scale
+        page.get_image(
+            scale=1.0
+        )  # puts the page image on the image cache at default scale
+
+        images_scale = self.options.images_scale
+        # user requested scales
+        if images_scale is not None:
+            page._default_image_scale = images_scale
+            page.get_image(
+                scale=images_scale
+            )  # this will trigger storing the image in the internal cache
+
+        return page
+
+    # Extract the page cells and store them in the page object
+    def _parse_page_cells(self, page: Page) -> Page:
+        assert page._backend is not None
+
+        page.cells = list(page._backend.get_text_cells())
+
+        # DEBUG code:
+        def draw_text_boxes(image, cells):
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
+            image.show()
+
+        # draw_text_boxes(page.get_image(scale=1.0), cells)
+
+        return page
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -3,29 +3,25 @@ from pathlib import Path
 from typing import Iterable, List

 import numpy
+from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw

-from docling.datamodel.base_models import (
-    BoundingBox,
-    Page,
-    TableCell,
-    TableElement,
-    TableStructurePrediction,
-)
-from docling.datamodel.pipeline_options import TableFormerMode
+from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.models.base_model import BasePageModel


-class TableStructureModel:
-    def __init__(self, config):
-        self.config = config
-        self.do_cell_matching = config["do_cell_matching"]
-        self.mode = config["mode"]
+class TableStructureModel(BasePageModel):
+    def __init__(
+        self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
+    ):
+        self.options = options
+        self.do_cell_matching = self.options.do_cell_matching
+        self.mode = self.options.mode

-        self.enabled = config["enabled"]
+        self.enabled = enabled
        if self.enabled:
-            artifacts_path: Path = config["artifacts_path"]
-
            if self.mode == TableFormerMode.ACCURATE:
                artifacts_path = artifacts_path / "fat"

@@ -39,7 +35,9 @@ class TableStructureModel:
            self.tf_predictor = TFPredictor(self.tm_config)
            self.scale = 2.0  # Scale up table input images to 144 dpi

-    def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
+    def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
+        assert page._backend is not None
+
        image = (
            page._backend.get_page_image()
        )  # make new image to avoid drawing on the saved ones
@@ -50,17 +48,18 @@ class TableStructureModel:
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")

            for tc in table_element.table_cells:
-                x0, y0, x1, y1 = tc.bbox.as_tuple()
-                if tc.column_header:
-                    width = 3
-                else:
-                    width = 1
-                draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
-                draw.text(
-                    (x0 + 3, y0 + 3),
-                    text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
-                    fill="black",
-                )
+                if tc.bbox is not None:
+                    x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    if tc.column_header:
+                        width = 3
+                    else:
+                        width = 1
+                    draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
+                    draw.text(
+                        (x0 + 3, y0 + 3),
+                        text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
+                        fill="black",
+                    )

        image.show()

@@ -71,6 +70,9 @@ class TableStructureModel:
            return

        for page in page_batch:
+            assert page._backend is not None
+            assert page.predictions.layout is not None
+            assert page.size is not None

            page.predictions.tablestructure = TableStructurePrediction()  # dummy

@@ -85,7 +87,7 @@ class TableStructureModel:
                    ],
                )
                for cluster in page.predictions.layout.clusters
-                if cluster.label == "Table"
+                if cluster.label == DocItemLabel.TABLE
            ]
            if not len(in_tables):
                yield page
@@ -132,7 +134,7 @@ class TableStructureModel:
                            element["bbox"]["token"] = text_piece

                        tc = TableCell.model_validate(element)
-                        if self.do_cell_matching:
+                        if self.do_cell_matching and tc.bbox is not None:
                            tc.bbox = tc.bbox.scaled(1 / self.scale)
                        table_cells.append(tc)

@@ -141,7 +143,7 @@ class TableStructureModel:
                    num_cols = table_out["predict_details"]["num_cols"]
                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]

-                    tbl = TableElement(
+                    tbl = Table(
                        otsl_seq=otsl_seq,
                        table_cells=table_cells,
                        num_rows=num_rows,
@@ -149,7 +151,7 @@ class TableStructureModel:
                        id=table_cluster.id,
                        page_no=page.page_no,
                        cluster=table_cluster,
-                        label="Table",
+                        label=DocItemLabel.TABLE,
                    )

                    page.predictions.tablestructure.table_map[table_cluster.id] = tbl
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -2,11 +2,12 @@ import io
 import logging
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, Tuple
+from typing import Iterable, Optional, Tuple

 import pandas as pd
+from docling_core.types.doc import BoundingBox, CoordOrigin

-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel

@@ -21,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel):

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

-        self._name = None
-        self._version = None
+        self._name: Optional[str] = None
+        self._version: Optional[str] = None

        if self.enabled:
            try:
@@ -39,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel):
    def _get_name_and_version(self) -> Tuple[str, str]:

        if self._name != None and self._version != None:
-            return self._name, self._version
+            return self._name, self._version  # type: ignore

        cmd = [self.options.tesseract_cmd, "--version"]

@@ -108,6 +109,8 @@ class TesseractOcrCliModel(BaseOcrModel):
            return

        for page in page_batch:
+            assert page._backend is not None
+
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -1,9 +1,9 @@
 import logging
 from typing import Iterable

-import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin

-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel

@@ -68,6 +68,9 @@ class TesseractOcrModel(BaseOcrModel):
            return

        for page in page_batch:
+            assert page._backend is not None
+            assert self.reader is not None
+
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
--- a/docling/pipeline/base_model_pipeline.py
+++ b/docling/pipeline/base_model_pipeline.py
@@ -1,18 +0,0 @@
-from pathlib import Path
-from typing import Callable, Iterable, List
-
-from docling.datamodel.base_models import Page
-from docling.datamodel.pipeline_options import PipelineOptions
-
-
-class BaseModelPipeline:
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        self.model_pipe: List[Callable] = []
-        self.artifacts_path = artifacts_path
-        self.pipeline_options = pipeline_options
-
-    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
-        for model in self.model_pipe:
-            page_batch = model(page_batch)
-
-        yield from page_batch
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -0,0 +1,190 @@
+import functools
+import logging
+import time
+import traceback
+from abc import ABC, abstractmethod
+from typing import Callable, Iterable, List
+
+from docling_core.types.doc import DoclingDocument, NodeItem
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
+    Page,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.settings import settings
+from docling.models.base_model import BaseEnrichmentModel
+from docling.utils.utils import chunkify
+
+_log = logging.getLogger(__name__)
+
+
+class BasePipeline(ABC):
+    def __init__(self, pipeline_options: PipelineOptions):
+        self.pipeline_options = pipeline_options
+        self.build_pipe: List[Callable] = []
+        self.enrichment_pipe: List[BaseEnrichmentModel] = []
+
+    def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
+        conv_res = ConversionResult(input=in_doc)
+
+        _log.info(f"Processing document {in_doc.file.name}")
+        try:
+            # These steps are building and assembling the structure of the
+            # output DoclingDocument
+            conv_res = self._build_document(in_doc, conv_res)
+            conv_res = self._assemble_document(in_doc, conv_res)
+            # From this stage, all operations should rely only on conv_res.output
+            conv_res = self._enrich_document(in_doc, conv_res)
+            conv_res.status = self._determine_status(in_doc, conv_res)
+        except Exception as e:
+            conv_res.status = ConversionStatus.FAILURE
+            if raises_on_error:
+                raise e
+
+        return conv_res
+
+    @abstractmethod
+    def _build_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+        pass
+
+    def _assemble_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+        return conv_res
+
+    def _enrich_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+
+        def _filter_elements(
+            doc: DoclingDocument, model: BaseEnrichmentModel
+        ) -> Iterable[NodeItem]:
+            for element, _level in doc.iterate_items():
+                if model.is_processable(doc=doc, element=element):
+                    yield element
+
+        for model in self.enrichment_pipe:
+            for element_batch in chunkify(
+                _filter_elements(conv_res.document, model),
+                settings.perf.elements_batch_size,
+            ):
+                # TODO: currently we assume the element itself is modified, because
+                # we don't have an interface to save the element back to the document
+                for element in model(
+                    doc=conv_res.document, element_batch=element_batch
+                ):  # Must exhaust!
+                    pass
+
+        return conv_res
+
+    @abstractmethod
+    def _determine_status(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionStatus:
+        pass
+
+    @classmethod
+    @abstractmethod
+    def get_default_options(cls) -> PipelineOptions:
+        pass
+
+    @classmethod
+    @abstractmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        pass
+
+    # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
+    #    for model in self.build_pipe:
+    #        element_batch = model(element_batch)
+    #
+    #    yield from element_batch
+
+
+class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
+
+    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        for model in self.build_pipe:
+            page_batch = model(page_batch)
+
+        yield from page_batch
+
+    def _build_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+
+        if not isinstance(in_doc._backend, PdfDocumentBackend):
+            raise RuntimeError(
+                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
+                f"Can not convert this with a PDF pipeline. "
+                f"Please check your format configuration on DocumentConverter."
+            )
+            # conv_res.status = ConversionStatus.FAILURE
+            # return conv_res
+
+        for i in range(0, in_doc.page_count):
+            conv_res.pages.append(Page(page_no=i))
+
+        try:
+            # Iterate batches of pages (page_batch_size) in the doc
+            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
+                start_pb_time = time.time()
+
+                # 1. Initialise the page resources
+                init_pages = map(
+                    functools.partial(self.initialize_page, in_doc), page_batch
+                )
+
+                # 2. Run pipeline stages
+                pipeline_pages = self._apply_on_pages(init_pages)
+
+                for p in pipeline_pages:  # Must exhaust!
+                    pass
+
+                end_pb_time = time.time() - start_pb_time
+                _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+
+        except Exception as e:
+            conv_res.status = ConversionStatus.FAILURE
+            trace = "\n".join(traceback.format_exception(e))
+            _log.warning(
+                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
+                f"{trace}"
+            )
+            raise e
+
+        finally:
+            # Always unload the PDF backend, even in case of failure
+            if in_doc._backend:
+                in_doc._backend.unload()
+
+        return conv_res
+
+    def _determine_status(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionStatus:
+        status = ConversionStatus.SUCCESS
+        for page in conv_res.pages:
+            if page._backend is None or not page._backend.is_valid():
+                conv_res.errors.append(
+                    ErrorItem(
+                        component_type=DoclingComponentType.DOCUMENT_BACKEND,
+                        module_name=type(page._backend).__name__,
+                        error_message=f"Page {page.page_no} failed to parse.",
+                    )
+                )
+                status = ConversionStatus.PARTIAL_SUCCESS
+
+        return status
+
+    # Initialise and load resources for a page
+    @abstractmethod
+    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+        pass
--- a/docling/pipeline/simple_pipeline.py
+++ b/docling/pipeline/simple_pipeline.py
@@ -0,0 +1,59 @@
+import logging
+
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    DeclarativeDocumentBackend,
+)
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import PipelineOptions
+from docling.pipeline.base_pipeline import BasePipeline
+
+_log = logging.getLogger(__name__)
+
+
+class SimplePipeline(BasePipeline):
+    """SimplePipeline.
+
+    This class is used at the moment for formats / backends
+    which produce straight DoclingDocument output.
+    """
+
+    def __init__(self, pipeline_options: PipelineOptions):
+        super().__init__(pipeline_options)
+
+    def _build_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+
+        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
+            raise RuntimeError(
+                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
+                f"Can not convert this with simple pipeline. "
+                f"Please check your format configuration on DocumentConverter."
+            )
+            # conv_res.status = ConversionStatus.FAILURE
+            # return conv_res
+
+        # Instead of running a page-level pipeline to build up the document structure,
+        # the backend is expected to be of type DeclarativeDocumentBackend, which can output
+        # a DoclingDocument straight.
+
+        conv_res.document = in_doc._backend.convert()
+        return conv_res
+
+    def _determine_status(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionStatus:
+        # This is called only if the previous steps didn't raise.
+        # Since we don't have anything else to evaluate, we can
+        # safely return SUCCESS.
+        return ConversionStatus.SUCCESS
+
+    @classmethod
+    def get_default_options(cls) -> PipelineOptions:
+        return PipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, DeclarativeDocumentBackend)
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@@ -1,66 +0,0 @@
-from pathlib import Path
-
-from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    PipelineOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
-)
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.models.easyocr_model import EasyOcrModel
-from docling.models.layout_model import LayoutModel
-from docling.models.table_structure_model import TableStructureModel
-from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
-from docling.models.tesseract_ocr_model import TesseractOcrModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-
-
-class StandardModelPipeline(BaseModelPipeline):
-    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
-    _table_model_path = "model_artifacts/tableformer"
-
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        super().__init__(artifacts_path, pipeline_options)
-
-        ocr_model: BaseOcrModel
-        if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
-            ocr_model = EasyOcrModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
-            ocr_model = TesseractOcrCliModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
-            ocr_model = TesseractOcrModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        else:
-            raise RuntimeError(
-                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
-            )
-
-        self.model_pipe = [
-            # OCR
-            ocr_model,
-            # Layout
-            LayoutModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._layout_model_path
-                }
-            ),
-            # Table structure
-            TableStructureModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._table_model_path,
-                    "enabled": pipeline_options.do_table_structure,
-                    "mode": pipeline_options.table_structure_options.mode,
-                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
-                }
-            ),
-        ]
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -0,0 +1,198 @@
+import logging
+from pathlib import Path
+from typing import Optional
+
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import AssembledUnit, Page
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.ds_glm_model import GlmModel, GlmOptions
+from docling.models.easyocr_model import EasyOcrModel
+from docling.models.layout_model import LayoutModel
+from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
+from docling.models.page_preprocessing_model import (
+    PagePreprocessingModel,
+    PagePreprocessingOptions,
+)
+from docling.models.table_structure_model import TableStructureModel
+from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
+from docling.models.tesseract_ocr_model import TesseractOcrModel
+from docling.pipeline.base_pipeline import PaginatedPipeline
+
+_log = logging.getLogger(__name__)
+
+
+class StandardPdfPipeline(PaginatedPipeline):
+    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
+    _table_model_path = "model_artifacts/tableformer"
+
+    def __init__(self, pipeline_options: PdfPipelineOptions):
+        super().__init__(pipeline_options)
+        self.pipeline_options: PdfPipelineOptions
+
+        if pipeline_options.artifacts_path is None:
+            self.artifacts_path = self.download_models_hf()
+        else:
+            self.artifacts_path = Path(pipeline_options.artifacts_path)
+
+        keep_images = (
+            self.pipeline_options.generate_page_images
+            or self.pipeline_options.generate_picture_images
+            or self.pipeline_options.generate_table_images
+        )
+
+        self.glm_model = GlmModel(options=GlmOptions())
+
+        if (ocr_model := self.get_ocr_model()) is None:
+            raise RuntimeError(
+                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
+            )
+
+        self.build_pipe = [
+            # Pre-processing
+            PagePreprocessingModel(
+                options=PagePreprocessingOptions(
+                    images_scale=pipeline_options.images_scale
+                )
+            ),
+            # OCR
+            ocr_model,
+            # Layout model
+            LayoutModel(
+                artifacts_path=self.artifacts_path
+                / StandardPdfPipeline._layout_model_path
+            ),
+            # Table structure model
+            TableStructureModel(
+                enabled=pipeline_options.do_table_structure,
+                artifacts_path=self.artifacts_path
+                / StandardPdfPipeline._table_model_path,
+                options=pipeline_options.table_structure_options,
+            ),
+            # Page assemble
+            PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
+        ]
+
+        self.enrichment_pipe = [
+            # Other models working on `NodeItem` elements in the DoclingDocument
+        ]
+
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+
+        download_path = snapshot_download(
+            repo_id="ds4sd/docling-models",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v2.0.1",
+        )
+
+        return Path(download_path)
+
+    def get_ocr_model(self) -> Optional[BaseOcrModel]:
+        if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
+            return EasyOcrModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
+        elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
+            return TesseractOcrCliModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
+        elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
+            return TesseractOcrModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
+        return None
+
+    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+        page._backend = doc._backend.load_page(page.page_no)  # type: ignore
+        if page._backend is not None and page._backend.is_valid():
+            page.size = page._backend.get_size()
+
+        return page
+
+    def _assemble_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+        all_elements = []
+        all_headers = []
+        all_body = []
+
+        for p in conv_res.pages:
+            assert p.assembled is not None
+            for el in p.assembled.body:
+                all_body.append(el)
+            for el in p.assembled.headers:
+                all_headers.append(el)
+            for el in p.assembled.elements:
+                all_elements.append(el)
+
+        conv_res.assembled = AssembledUnit(
+            elements=all_elements, headers=all_headers, body=all_body
+        )
+
+        conv_res.document = self.glm_model(conv_res)
+
+        # Generate page images in the output
+        if self.pipeline_options.generate_page_images:
+            for page in conv_res.pages:
+                assert page.image is not None
+                page_no = page.page_no + 1
+                conv_res.document.pages[page_no].image = ImageRef.from_pil(
+                    page.image, dpi=int(72 * self.pipeline_options.images_scale)
+                )
+
+        # Generate images of the requested element types
+        if (
+            self.pipeline_options.generate_picture_images
+            or self.pipeline_options.generate_table_images
+        ):
+            scale = self.pipeline_options.images_scale
+            for element, _level in conv_res.document.iterate_items():
+                if not isinstance(element, DocItem) or len(element.prov) == 0:
+                    continue
+                if (
+                    isinstance(element, PictureItem)
+                    and self.pipeline_options.generate_picture_images
+                ) or (
+                    isinstance(element, TableItem)
+                    and self.pipeline_options.generate_table_images
+                ):
+                    page_ix = element.prov[0].page_no - 1
+                    page = conv_res.pages[page_ix]
+                    assert page.size is not None
+                    assert page.image is not None
+
+                    crop_bbox = (
+                        element.prov[0]
+                        .bbox.scaled(scale=scale)
+                        .to_top_left_origin(page_height=page.size.height * scale)
+                    )
+
+                    cropped_im = page.image.crop(crop_bbox.as_tuple())
+                    element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
+
+        return conv_res
+
+    @classmethod
+    def get_default_options(cls) -> PdfPipelineOptions:
+        return PdfPipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, PdfDocumentBackend)
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -1,9 +1,10 @@
 import logging
 from typing import Any, Dict, Iterable, List, Tuple, Union

-from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
+from docling.datamodel.base_models import OcrCell
 from docling.datamodel.document import ConversionResult, Page

 _log = logging.getLogger(__name__)
@@ -40,7 +41,7 @@ def generate_multimodal_pages(
    end_ix = 0
    doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []

-    doc = doc_result.output
+    doc = doc_result.legacy_document

    def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
        segments = []
--- a/docling/utils/layout_utils.py
+++ b/docling/utils/layout_utils.py
@@ -2,6 +2,7 @@ import copy
 import logging

 import networkx as nx
+from docling_core.types.doc import DocItemLabel

 logger = logging.getLogger("layout_utils")

@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
            "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
        )
        logger.debug("  with cells: " + str(new_cluster["cell_ids"]))
-        if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
+        if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
            logger.debug("  Empty non-picture, removed")
            continue  ## Skip this former cluster, now without cells.
        new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):


 def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
-    if not (cluster["type"] in ["Table", "Picture"]):
+    if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
        ## A text-like cluster. The bbox only needs to be around the text cells:
        logger.debug("    Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug("  New bounding box:" + str(new_bbox))
-    if cluster["type"] == "Picture":
+    if cluster["type"] == DocItemLabel.PICTURE:
        ## We only make the bbox completely comprise included text cells:
        logger.debug("  Picture")
        if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
    max_id = -1
    figures = []
    for cluster in cluster_predictions:
-        if cluster["type"] == "Picture":
+        if cluster["type"] == DocItemLabel.PICTURE:
            figures.append(cluster)

        if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
            # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
            if fig_flag == False and lines_detector == False:
                # get class from low confidence detections if not set as text:
-                class_type = "Text"
+                class_type = DocItemLabel.TEXT

                for cluster in cluster_predictions_low:
                    intersection = compute_intersection(
                        orph_cell["bbox"], cluster["bbox"]
                    )
-                    class_type = "Text"
+                    class_type = DocItemLabel.TEXT
                    if (
                        cluster["confidence"] > 0.1
                        and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
                    if cluster["id"] == node:
                        lines.append(cluster)
                        cluster_predictions.remove(cluster)
-            new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
+            new_merged_cluster = build_cluster_from_lines(
+                lines, DocItemLabel.TEXT, max_id
+            )
            cluster_predictions.append(new_merged_cluster)
    return cluster_predictions

@@ -753,9 +756,9 @@ def clean_up_clusters(
                # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
                elif img_table == True:
                    if (
-                        cluster_1["type"] == "Text"
-                        and cluster_2["type"] == "Picture"
-                        or cluster_2["type"] == "Table"
+                        cluster_1["type"] == DocItemLabel.TEXT
+                        and cluster_2["type"] == DocItemLabel.PICTURE
+                        or cluster_2["type"] == DocItemLabel.TABLE
                    ):
                        if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +774,10 @@ def clean_up_clusters(
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
            # remove tables that have one pdf cell
            if one_cell_table == True:
-                if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
+                if (
+                    cluster_1["type"] == DocItemLabel.TABLE
+                    and len(cluster_1["cell_ids"]) < 2
+                ):
                    DuplicateDeletedClusterIDs.append(cluster_1["id"])

    DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))