feat!: Docling v2 (#117)

--------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-12-15 16:18:22 +00:00 · 2024-10-16 21:02:03 +02:00
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,18 +1,19 @@
-import copy
-import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union

-from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, Field, model_validator
-from typing_extensions import Self
-
-from docling.backend.abstract_backend import PdfPageBackend
-from docling.datamodel.pipeline_options import (  # Must be imported here for backward compatibility.
-    PipelineOptions,
-    TableStructureOptions,
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItemLabel,
+    PictureDataType,
+    Size,
+    TableCell,
 )
+from PIL.Image import Image
+from pydantic import BaseModel, ConfigDict
+
+if TYPE_CHECKING:
+    from docling.backend.pdf_backend import PdfPageBackend


 class ConversionStatus(str, Enum):
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
    PARTIAL_SUCCESS = auto()


+class InputFormat(str, Enum):
+    DOCX = "docx"
+    PPTX = "pptx"
+    HTML = "html"
+    IMAGE = "image"
+    PDF = "pdf"
+
+
+class OutputFormat(str, Enum):
+    MARKDOWN = "md"
+    JSON = "json"
+    TEXT = "text"
+    DOCTAGS = "doctags"
+
+
+FormatToExtensions: Dict[InputFormat, List[str]] = {
+    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
+    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
+    InputFormat.PDF: ["pdf"],
+    InputFormat.HTML: ["html", "htm", "xhtml"],
+    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+}
+
+FormatToMimeType: Dict[InputFormat, Set[str]] = {
+    InputFormat.DOCX: {
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+    },
+    InputFormat.PPTX: {
+        "application/vnd.openxmlformats-officedocument.presentationml.template",
+        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    },
+    InputFormat.HTML: {"text/html", "application/xhtml+xml"},
+    InputFormat.IMAGE: {
+        "image/png",
+        "image/jpeg",
+        "image/tiff",
+        "image/gif",
+        "image/bmp",
+    },
+    InputFormat.PDF: {"application/pdf"},
+}
+MimeTypeToFormat = {
+    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
+}
+
+
 class DocInputType(str, Enum):
    PATH = auto()
    STREAM = auto()


-class CoordOrigin(str, Enum):
-    TOPLEFT = auto()
-    BOTTOMLEFT = auto()
-
-
 class DoclingComponentType(str, Enum):
-    PDF_BACKEND = auto()
+    DOCUMENT_BACKEND = auto()
    MODEL = auto()
    DOC_ASSEMBLER = auto()

@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
    error_message: str


-class PageSize(BaseModel):
-    width: float = 0.0
-    height: float = 0.0
-
-
-class BoundingBox(BaseModel):
-    l: float  # left
-    t: float  # top
-    r: float  # right
-    b: float  # bottom
-
-    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
-
-    @property
-    def width(self):
-        return self.r - self.l
-
-    @property
-    def height(self):
-        return abs(self.t - self.b)
-
-    def scaled(self, scale: float) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l *= scale
-        out_bbox.r *= scale
-        out_bbox.t *= scale
-        out_bbox.b *= scale
-
-        return out_bbox
-
-    def normalized(self, page_size: PageSize) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l /= page_size.width
-        out_bbox.r /= page_size.width
-        out_bbox.t /= page_size.height
-        out_bbox.b /= page_size.height
-
-        return out_bbox
-
-    def as_tuple(self):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return (self.l, self.t, self.r, self.b)
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return (self.l, self.b, self.r, self.t)
-
-    @classmethod
-    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
-        if origin == CoordOrigin.TOPLEFT:
-            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b < t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-        elif origin == CoordOrigin.BOTTOMLEFT:
-            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b > t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-
-    def area(self) -> float:
-        area = (self.r - self.l) * (self.b - self.t)
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            area = -area
-        return area
-
-    def intersection_area_with(self, other: "BoundingBox") -> float:
-        # Calculate intersection coordinates
-        left = max(self.l, other.l)
-        top = max(self.t, other.t)
-        right = min(self.r, other.r)
-        bottom = min(self.b, other.b)
-
-        # Calculate intersection dimensions
-        width = right - left
-        height = bottom - top
-
-        # If the bounding boxes do not overlap, width or height will be negative
-        if width <= 0 or height <= 0:
-            return 0.0
-
-        return width * height
-
-    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.TOPLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,
-                b=page_height - self.b,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            )
-
-    def to_top_left_origin(self, page_height):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,  # self.b
-                b=page_height - self.b,  # self.t
-                coord_origin=CoordOrigin.TOPLEFT,
-            )
-
-
 class Cell(BaseModel):
    id: int
    text: str
@@ -169,14 +101,14 @@ class OcrCell(Cell):

 class Cluster(BaseModel):
    id: int
-    label: str
+    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
    cells: List[Cell] = []


 class BasePageElement(BaseModel):
-    label: str
+    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
    clusters: List[Cluster] = []


-class TableCell(BaseModel):
-    bbox: BoundingBox
-    row_span: int
-    col_span: int
-    start_row_offset_idx: int
-    end_row_offset_idx: int
-    start_col_offset_idx: int
-    end_col_offset_idx: int
-    text: str
-    column_header: bool = False
-    row_header: bool = False
-    row_section: bool = False
-
-    @model_validator(mode="before")
-    @classmethod
-    def from_dict_format(cls, data: Any) -> Any:
-        if isinstance(data, Dict):
-            text = data["bbox"].get("token", "")
-            if not len(text):
-                text_cells = data.pop("text_cell_bboxes", None)
-                if text_cells:
-                    for el in text_cells:
-                        text += el["token"] + " "
-
-                text = text.strip()
-            data["text"] = text
-
-        return data
-
-
-class TableElement(BasePageElement):
+class Table(BasePageElement):
    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):


 class TableStructurePrediction(BaseModel):
-    table_map: Dict[int, TableElement] = {}
+    table_map: Dict[int, Table] = {}


-class TextElement(BasePageElement): ...
-
-
-class FigureData(BaseModel):
-    pass
+class TextElement(BasePageElement):
+    text: str


 class FigureElement(BasePageElement):
-    data: Optional[FigureData] = None
+    annotations: List[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
    equations_prediction: Optional[EquationPrediction] = None


-PageElement = Union[TextElement, TableElement, FigureElement]
+PageElement = Union[TextElement, Table, FigureElement]


 class AssembledUnit(BaseModel):
@@ -272,13 +171,13 @@ class Page(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
-    page_hash: Optional[str] = None
-    size: Optional[PageSize] = None
+    # page_hash: Optional[str] = None
+    size: Optional[Size] = None
    cells: List[Cell] = []
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

-    _backend: Optional[PdfPageBackend] = (
+    _backend: Optional["PdfPageBackend"] = (
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
@@ -301,24 +200,5 @@ class Page(BaseModel):
 class DocumentStream(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

-    filename: str
+    name: str
    stream: BytesIO
-
-
-class AssembleOptions(BaseModel):
-    keep_page_images: Annotated[
-        bool,
-        Field(
-            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
-        ),
-    ] = False  # False: page images are removed in the assemble step
-    images_scale: Optional[float] = None  # if set, the scale for generated images
-
-    @model_validator(mode="after")
-    def set_page_images_from_deprecated(self) -> Self:
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", DeprecationWarning)
-            default_scale = 1.0
-            if self.keep_page_images and self.images_scale is None:
-                self.images_scale = default_scale
-        return self
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,87 +1,101 @@
 import logging
+import re
+from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union

-from docling_core.types import BaseCell, BaseText
+import filetype
+from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
-from docling_core.types import TableCell
-from docling_core.types.doc.base import BoundingBox as DsBoundingBox
-from docling_core.types.doc.base import Figure
+from docling_core.types.doc import (
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    PictureItem,
+    SectionHeaderItem,
+    TableItem,
+    TextItem,
+)
+from docling_core.types.doc.document import ListItem
+from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
+from docling_core.utils.file import resolve_file_source
 from pydantic import BaseModel
 from typing_extensions import deprecated

-from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    PaginatedDocumentBackend,
+)
 from docling.datamodel.base_models import (
    AssembledUnit,
    ConversionStatus,
    DocumentStream,
    ErrorItem,
-    FigureElement,
+    InputFormat,
+    MimeTypeToFormat,
    Page,
-    PageElement,
-    TableElement,
-    TextElement,
 )
 from docling.datamodel.settings import DocumentLimits
-from docling.utils.utils import create_file_hash
+from docling.utils.utils import create_file_hash, create_hash
+
+if TYPE_CHECKING:
+    from docling.document_converter import FormatOption

 _log = logging.getLogger(__name__)

 layout_label_to_ds_type = {
-    "Title": "title",
-    "Document Index": "table-of-path_or_stream",
-    "Section-header": "subtitle-level-1",
-    "Checkbox-Selected": "checkbox-selected",
-    "Checkbox-Unselected": "checkbox-unselected",
-    "Caption": "caption",
-    "Page-header": "page-header",
-    "Page-footer": "page-footer",
-    "Footnote": "footnote",
-    "Table": "table",
-    "Formula": "equation",
-    "List-item": "paragraph",
-    "Code": "paragraph",
-    "Picture": "figure",
-    "Text": "paragraph",
+    DocItemLabel.TITLE: "title",
+    DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
+    DocItemLabel.SECTION_HEADER: "subtitle-level-1",
+    DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
+    DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
+    DocItemLabel.CAPTION: "caption",
+    DocItemLabel.PAGE_HEADER: "page-header",
+    DocItemLabel.PAGE_FOOTER: "page-footer",
+    DocItemLabel.FOOTNOTE: "footnote",
+    DocItemLabel.TABLE: "table",
+    DocItemLabel.FORMULA: "equation",
+    DocItemLabel.LIST_ITEM: "paragraph",
+    DocItemLabel.CODE: "paragraph",
+    DocItemLabel.PICTURE: "figure",
+    DocItemLabel.TEXT: "paragraph",
+    DocItemLabel.PARAGRAPH: "paragraph",
 }

-_EMPTY_DOC = DsDocument(
-    _name="",
-    description=DsDocumentDescription(logs=[]),
-    file_info=DsFileInfoObject(
-        filename="",
-        document_hash="",
-    ),
-)
+_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")


 class InputDocument(BaseModel):
-    file: PurePath = None
-    document_hash: Optional[str] = None
-    valid: bool = False
+    file: PurePath
+    document_hash: str  # = None
+    valid: bool = True
    limits: DocumentLimits = DocumentLimits()
+    format: InputFormat  # = None

    filesize: Optional[int] = None
-    page_count: Optional[int] = None
+    page_count: int = 0

-    _backend: PdfDocumentBackend = None  # Internal PDF backend used
+    _backend: AbstractDocumentBackend  # Internal document backend used

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
+        format: InputFormat,
+        backend: Type[AbstractDocumentBackend],
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
-        pdf_backend=DoclingParseDocumentBackend,
    ):
-        super().__init__()
+        super().__init__(
+            file="", document_hash="", format=InputFormat.PDF
+        )  # initialize with dummy values

        self.limits = limits or DocumentLimits()
+        self.format = format

        try:
            if isinstance(path_or_stream, Path):
@@ -91,11 +105,12 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(
-                        path_or_stream=path_or_stream, document_hash=self.document_hash
-                    )
+                    self._init_doc(backend, path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
+                assert (
+                    filename is not None
+                ), "Can't construct InputDocument from stream without providing filename arg."
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

@@ -103,15 +118,20 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(
-                        path_or_stream=path_or_stream, document_hash=self.document_hash
-                    )
+                    self._init_doc(backend, path_or_stream)
+            else:
+                raise RuntimeError(
+                    f"Unexpected type path_or_stream: {type(path_or_stream)}"
+                )

-            if self.document_hash and self._backend.page_count() > 0:
-                self.page_count = self._backend.page_count()
-
-                if self.page_count <= self.limits.max_num_pages:
-                    self.valid = True
+            # For paginated backends, check if the maximum page count is exceeded.
+            if self.valid and self._backend.is_valid():
+                if self._backend.supports_pagination() and isinstance(
+                    self._backend, PaginatedDocumentBackend
+                ):
+                    self.page_count = self._backend.page_count()
+                    if not self.page_count <= self.limits.max_num_pages:
+                        self.valid = False

        except (FileNotFoundError, OSError) as e:
            _log.exception(
@@ -125,9 +145,26 @@ class InputDocument(BaseModel):
            )
            # raise

+    def _init_doc(
+        self,
+        backend: Type[AbstractDocumentBackend],
+        path_or_stream: Union[BytesIO, Path],
+    ) -> None:
+        if backend is None:
+            raise RuntimeError(
+                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
+                f"Please check your format configuration on DocumentConverter."
+            )

-@deprecated("Use `ConversionResult` instead.")
-class ConvertedDocument(BaseModel):
+        self._backend = backend(self, path_or_stream=path_or_stream)
+
+
+class DocumentFormat(str, Enum):
+    V2 = "v2"
+    V1 = "v1"
+
+
+class ConversionResult(BaseModel):
    input: InputDocument

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
@@ -136,15 +173,42 @@ class ConvertedDocument(BaseModel):
    pages: List[Page] = []
    assembled: AssembledUnit = AssembledUnit()

-    output: DsDocument = _EMPTY_DOC
+    document: DoclingDocument = _EMPTY_DOCLING_DOC
+
+    @property
+    @deprecated("Use document instead.")
+    def legacy_document(self):
+        reverse_label_mapping = {
+            DocItemLabel.CAPTION.value: "Caption",
+            DocItemLabel.FOOTNOTE.value: "Footnote",
+            DocItemLabel.FORMULA.value: "Formula",
+            DocItemLabel.LIST_ITEM.value: "List-item",
+            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
+            DocItemLabel.PAGE_HEADER.value: "Page-header",
+            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
+            DocItemLabel.SECTION_HEADER.value: "Section-header",
+            DocItemLabel.TABLE.value: "Table",
+            DocItemLabel.TEXT.value: "Text",
+            DocItemLabel.TITLE.value: "Title",
+            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
+            DocItemLabel.CODE.value: "Code",
+            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
+            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
+            DocItemLabel.FORM.value: "Form",
+            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
+            DocItemLabel.PARAGRAPH.value: "paragraph",
+        }

-    def _to_ds_document(self) -> DsDocument:
        title = ""
        desc = DsDocumentDescription(logs=[])

        page_hashes = [
-            PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
-            for p in self.pages
+            PageReference(
+                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
+                page=p.page_no,
+                model="default",
+            )
+            for p in self.document.pages.values()
        ]

        file_info = DsFileInfoObject(
@@ -157,145 +221,199 @@ class ConvertedDocument(BaseModel):
        main_text = []
        tables = []
        figures = []
+        equations = []
+        footnotes = []
+        page_headers = []
+        page_footers = []

-        page_no_to_page = {p.page_no: p for p in self.pages}
+        embedded_captions = set()
+        for ix, (item, level) in enumerate(
+            self.document.iterate_items(self.document.body)
+        ):

-        for element in self.assembled.elements:
-            # Convert bboxes to lower-left origin.
-            target_bbox = DsBoundingBox(
-                element.cluster.bbox.to_bottom_left_origin(
-                    page_no_to_page[element.page_no].size.height
-                ).as_tuple()
-            )
+            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
+                caption = item.caption_text(self.document)
+                if caption:
+                    embedded_captions.add(caption)

-            if isinstance(element, TextElement):
-                main_text.append(
-                    BaseText(
-                        text=element.text,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        name=element.label,
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, len(element.text)],
-                            )
-                        ],
-                    )
-                )
-            elif isinstance(element, TableElement):
-                index = len(tables)
-                ref_str = f"#/tables/{index}"
-                main_text.append(
-                    Ref(
-                        name=element.label,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        ref=ref_str,
-                    ),
-                )
+        for item, level in self.document.iterate_items():
+            if isinstance(item, DocItem):
+                item_type = item.label

-                # Initialise empty table data grid (only empty cells)
-                table_data = [
-                    [
-                        TableCell(
-                            text="",
-                            # bbox=[0,0,0,0],
-                            spans=[[i, j]],
-                            obj_type="body",
+                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
+
+                    if isinstance(item, ListItem) and item.marker:
+                        text = f"{item.marker} {item.text}"
+                    else:
+                        text = item.text
+
+                    # Can be empty.
+                    prov = [
+                        Prov(
+                            bbox=p.bbox.as_tuple(),
+                            page=p.page_no,
+                            span=[0, len(item.text)],
                        )
-                        for j in range(element.num_cols)
+                        for p in item.prov
                    ]
-                    for i in range(element.num_rows)
-                ]
+                    main_text.append(
+                        BaseText(
+                            text=text,
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            name=reverse_label_mapping[item.label],
+                            prov=prov,
+                        )
+                    )

-                # Overwrite cells in table data for which there is actual cell content.
-                for cell in element.table_cells:
-                    for i in range(
-                        min(cell.start_row_offset_idx, element.num_rows),
-                        min(cell.end_row_offset_idx, element.num_rows),
-                    ):
-                        for j in range(
-                            min(cell.start_col_offset_idx, element.num_cols),
-                            min(cell.end_col_offset_idx, element.num_cols),
+                    # skip captions if they are embedded in the actual
+                    # floating object
+                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
+                        continue
+
+                elif isinstance(item, TableItem) and item.data:
+                    index = len(tables)
+                    ref_str = f"#/tables/{index}"
+                    main_text.append(
+                        Ref(
+                            name=reverse_label_mapping[item.label],
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            ref=ref_str,
+                        ),
+                    )
+
+                    # Initialise empty table data grid (only empty cells)
+                    table_data = [
+                        [
+                            TableCell(
+                                text="",
+                                # bbox=[0,0,0,0],
+                                spans=[[i, j]],
+                                obj_type="body",
+                            )
+                            for j in range(item.data.num_cols)
+                        ]
+                        for i in range(item.data.num_rows)
+                    ]
+
+                    # Overwrite cells in table data for which there is actual cell content.
+                    for cell in item.data.table_cells:
+                        for i in range(
+                            min(cell.start_row_offset_idx, item.data.num_rows),
+                            min(cell.end_row_offset_idx, item.data.num_rows),
                        ):
-                            celltype = "body"
-                            if cell.column_header:
-                                celltype = "col_header"
-                            elif cell.row_header:
-                                celltype = "row_header"
-                            elif cell.row_section:
-                                celltype = "row_section"
+                            for j in range(
+                                min(cell.start_col_offset_idx, item.data.num_cols),
+                                min(cell.end_col_offset_idx, item.data.num_cols),
+                            ):
+                                celltype = "body"
+                                if cell.column_header:
+                                    celltype = "col_header"
+                                elif cell.row_header:
+                                    celltype = "row_header"
+                                elif cell.row_section:
+                                    celltype = "row_section"

-                            def make_spans(cell):
-                                for rspan in range(
-                                    min(cell.start_row_offset_idx, element.num_rows),
-                                    min(cell.end_row_offset_idx, element.num_rows),
-                                ):
-                                    for cspan in range(
+                                def make_spans(cell):
+                                    for rspan in range(
                                        min(
-                                            cell.start_col_offset_idx, element.num_cols
+                                            cell.start_row_offset_idx,
+                                            item.data.num_rows,
+                                        ),
+                                        min(
+                                            cell.end_row_offset_idx, item.data.num_rows
                                        ),
-                                        min(cell.end_col_offset_idx, element.num_cols),
                                    ):
-                                        yield [rspan, cspan]
+                                        for cspan in range(
+                                            min(
+                                                cell.start_col_offset_idx,
+                                                item.data.num_cols,
+                                            ),
+                                            min(
+                                                cell.end_col_offset_idx,
+                                                item.data.num_cols,
+                                            ),
+                                        ):
+                                            yield [rspan, cspan]

-                            spans = list(make_spans(cell))
-                            table_data[i][j] = TableCell(
-                                text=cell.text,
-                                bbox=cell.bbox.to_bottom_left_origin(
-                                    page_no_to_page[element.page_no].size.height
-                                ).as_tuple(),
-                                # col=j,
-                                # row=i,
-                                spans=spans,
-                                obj_type=celltype,
-                                # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
-                                # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
-                            )
+                                spans = list(make_spans(cell))
+                                table_data[i][j] = GlmTableCell(
+                                    text=cell.text,
+                                    bbox=(
+                                        cell.bbox.as_tuple()
+                                        if cell.bbox is not None
+                                        else None
+                                    ),  # check if this is bottom-left
+                                    spans=spans,
+                                    obj_type=celltype,
+                                    col=j,
+                                    row=i,
+                                    row_header=cell.row_header,
+                                    row_section=cell.row_section,
+                                    col_header=cell.column_header,
+                                    row_span=[
+                                        cell.start_row_offset_idx,
+                                        cell.end_row_offset_idx,
+                                    ],
+                                    col_span=[
+                                        cell.start_col_offset_idx,
+                                        cell.end_col_offset_idx,
+                                    ],
+                                )

-                tables.append(
-                    DsSchemaTable(
-                        num_cols=element.num_cols,
-                        num_rows=element.num_rows,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        data=table_data,
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, 0],
-                            )
-                        ],
+                    # Compute the caption
+                    caption = item.caption_text(self.document)
+
+                    tables.append(
+                        DsSchemaTable(
+                            text=caption,
+                            num_cols=item.data.num_cols,
+                            num_rows=item.data.num_rows,
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            data=table_data,
+                            prov=[
+                                Prov(
+                                    bbox=p.bbox.as_tuple(),
+                                    page=p.page_no,
+                                    span=[0, 0],
+                                )
+                                for p in item.prov
+                            ],
+                        )
                    )
-                )

-            elif isinstance(element, FigureElement):
-                index = len(figures)
-                ref_str = f"#/figures/{index}"
-                main_text.append(
-                    Ref(
-                        name=element.label,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        ref=ref_str,
-                    ),
-                )
-                figures.append(
-                    Figure(
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, 0],
-                            )
-                        ],
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        # data=[[]],
+                elif isinstance(item, PictureItem):
+                    index = len(figures)
+                    ref_str = f"#/figures/{index}"
+                    main_text.append(
+                        Ref(
+                            name=reverse_label_mapping[item.label],
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            ref=ref_str,
+                        ),
+                    )
+
+                    # Compute the caption
+                    caption = item.caption_text(self.document)
+
+                    figures.append(
+                        Figure(
+                            prov=[
+                                Prov(
+                                    bbox=p.bbox.as_tuple(),
+                                    page=p.page_no,
+                                    span=[0, len(caption)],
+                                )
+                                for p in item.prov
+                            ],
+                            obj_type=layout_label_to_ds_type.get(item.label),
+                            text=caption,
+                            # data=[[]],
+                        )
                    )
-                )

        page_dimensions = [
-            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
-            for p in self.pages
+            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
+            for p in self.document.pages.values()
        ]

        ds_doc = DsDocument(
@@ -303,6 +421,10 @@ class ConvertedDocument(BaseModel):
            description=desc,
            file_info=file_info,
            main_text=main_text,
+            equations=equations,
+            footnotes=footnotes,
+            page_headers=page_headers,
+            page_footers=page_footers,
            tables=tables,
            figures=figures,
            page_dimensions=page_dimensions,
@@ -310,152 +432,76 @@ class ConvertedDocument(BaseModel):

        return ds_doc

-    def render_as_dict(self):
-        return self.output.model_dump(by_alias=True, exclude_none=True)

-    def render_as_markdown(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
-        strict_text: bool = False,
-        image_placeholder: str = "<!-- image -->",
-    ):
-        return self.output.export_to_markdown(
-            delim=delim,
-            main_text_start=main_text_start,
-            main_text_stop=main_text_stop,
-            main_text_labels=main_text_labels,
-            strict_text=strict_text,
-            image_placeholder=image_placeholder,
-        )
+class _DocumentConversionInput(BaseModel):

-    def render_as_text(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-        ],
-    ):
-        return self.output.export_to_markdown(
-            delim=delim,
-            main_text_start=main_text_start,
-            main_text_stop=main_text_stop,
-            main_text_labels=main_text_labels,
-            strict_text=True,
-        )
-
-    def render_as_doctags(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
-        xsize: int = 100,
-        ysize: int = 100,
-        add_location: bool = True,
-        add_content: bool = True,
-        add_page_index: bool = True,
-        # table specific flags
-        add_table_cell_location: bool = False,
-        add_table_cell_label: bool = True,
-        add_table_cell_text: bool = True,
-    ) -> str:
-        return self.output.export_to_document_tokens(
-            delim=delim,
-            main_text_start=main_text_start,
-            main_text_stop=main_text_stop,
-            main_text_labels=main_text_labels,
-            xsize=xsize,
-            ysize=ysize,
-            add_location=add_location,
-            add_content=add_content,
-            add_page_index=add_page_index,
-            # table specific flags
-            add_table_cell_location=add_table_cell_location,
-            add_table_cell_label=add_table_cell_label,
-            add_table_cell_text=add_table_cell_text,
-        )
-
-    def render_element_images(
-        self, element_types: Tuple[PageElement] = (FigureElement,)
-    ):
-        for element in self.assembled.elements:
-            if isinstance(element, element_types):
-                page_ix = element.page_no
-                scale = self.pages[page_ix]._default_image_scale
-                crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
-                    page_height=self.pages[page_ix].size.height * scale
-                )
-
-                cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
-                yield element, cropped_im
-
-
-class ConversionResult(ConvertedDocument):
-    pass
-
-
-class DocumentConversionInput(BaseModel):
-
-    _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
+    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
    limits: Optional[DocumentLimits] = DocumentLimits()

-    DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
-
    def docs(
-        self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
+        self, format_options: Dict[InputFormat, "FormatOption"]
    ) -> Iterable[InputDocument]:
+        for item in self.path_or_stream_iterator:
+            obj = resolve_file_source(item) if isinstance(item, str) else item
+            format = self._guess_format(obj)
+            if format not in format_options.keys():
+                _log.info(
+                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                )
+                continue
+            else:
+                backend = format_options[format].backend

-        pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
-
-        for obj in self._path_or_stream_iterator:
            if isinstance(obj, Path):
                yield InputDocument(
-                    path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
+                    path_or_stream=obj,
+                    format=format,
+                    filename=obj.name,
+                    limits=self.limits,
+                    backend=backend,
                )
            elif isinstance(obj, DocumentStream):
                yield InputDocument(
                    path_or_stream=obj.stream,
-                    filename=obj.filename,
+                    format=format,
+                    filename=obj.name,
                    limits=self.limits,
-                    pdf_backend=pdf_backend,
+                    backend=backend,
                )
+            else:
+                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

-    @classmethod
-    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
-        paths = [Path(p) for p in paths]
+    def _guess_format(self, obj):
+        content = None
+        if isinstance(obj, Path):
+            mime = filetype.guess_mime(str(obj))
+            if mime is None:
+                with obj.open("rb") as f:
+                    content = f.read(1024)  # Read first 1KB

-        doc_input = cls(limits=limits)
-        doc_input._path_or_stream_iterator = paths
+        elif isinstance(obj, DocumentStream):
+            obj.stream.seek(0)
+            content = obj.stream.read(8192)
+            obj.stream.seek(0)
+            mime = filetype.guess_mime(content)

-        return doc_input
+        if mime is None:
+            mime = self._detect_html_xhtml(content)

-    @classmethod
-    def from_streams(
-        cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
-    ):
-        doc_input = cls(limits=limits)
-        doc_input._path_or_stream_iterator = streams
+        format = MimeTypeToFormat.get(mime)
+        return format

-        return doc_input
+    def _detect_html_xhtml(self, content):
+        content_str = content.decode("ascii", errors="ignore").lower()
+        # Remove XML comments
+        content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
+        content_str = content_str.lstrip()
+
+        if re.match(r"<\?xml", content_str):
+            if "xhtml" in content_str[:1000]:
+                return "application/xhtml+xml"
+
+        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+            return "text/html"
+
+        return None
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,4 +1,5 @@
 from enum import Enum, auto
+from pathlib import Path
 from typing import List, Literal, Optional, Union

 from pydantic import BaseModel, ConfigDict, Field
@@ -58,6 +59,13 @@ class TesseractOcrOptions(OcrOptions):


 class PipelineOptions(BaseModel):
+    create_legacy_output: bool = (
+        True  # This default will be set to False in a future version of docling
+    )
+
+
+class PdfPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

@@ -65,3 +73,8 @@ class PipelineOptions(BaseModel):
    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
        Field(EasyOcrOptions(), discriminator="kind")
    )
+
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+    generate_table_images: bool = False
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@@ -14,6 +14,7 @@ class BatchConcurrencySettings(BaseModel):
    doc_batch_concurrency: int = 2
    page_batch_size: int = 4
    page_batch_concurrency: int = 2
+    elements_batch_size: int = 16

    # doc_batch_size: int = 1
    # doc_batch_concurrency: int = 1