feat!: Docling v2 (#117)

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-12-08 20:58:11 +00:00 · 2024-10-16 21:02:03 +02:00
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,18 +1,19 @@
-import copy
-import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union

-from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, Field, model_validator
-from typing_extensions import Self
-
-from docling.backend.abstract_backend import PdfPageBackend
-from docling.datamodel.pipeline_options import (  # Must be imported here for backward compatibility.
-    PipelineOptions,
-    TableStructureOptions,
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItemLabel,
+    PictureDataType,
+    Size,
+    TableCell,
 )
+from PIL.Image import Image
+from pydantic import BaseModel, ConfigDict
+
+if TYPE_CHECKING:
+    from docling.backend.pdf_backend import PdfPageBackend


 class ConversionStatus(str, Enum):
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
    PARTIAL_SUCCESS = auto()


+class InputFormat(str, Enum):
+    DOCX = "docx"
+    PPTX = "pptx"
+    HTML = "html"
+    IMAGE = "image"
+    PDF = "pdf"
+
+
+class OutputFormat(str, Enum):
+    MARKDOWN = "md"
+    JSON = "json"
+    TEXT = "text"
+    DOCTAGS = "doctags"
+
+
+FormatToExtensions: Dict[InputFormat, List[str]] = {
+    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
+    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
+    InputFormat.PDF: ["pdf"],
+    InputFormat.HTML: ["html", "htm", "xhtml"],
+    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+}
+
+FormatToMimeType: Dict[InputFormat, Set[str]] = {
+    InputFormat.DOCX: {
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+    },
+    InputFormat.PPTX: {
+        "application/vnd.openxmlformats-officedocument.presentationml.template",
+        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    },
+    InputFormat.HTML: {"text/html", "application/xhtml+xml"},
+    InputFormat.IMAGE: {
+        "image/png",
+        "image/jpeg",
+        "image/tiff",
+        "image/gif",
+        "image/bmp",
+    },
+    InputFormat.PDF: {"application/pdf"},
+}
+MimeTypeToFormat = {
+    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
+}
+
+
 class DocInputType(str, Enum):
    PATH = auto()
    STREAM = auto()


-class CoordOrigin(str, Enum):
-    TOPLEFT = auto()
-    BOTTOMLEFT = auto()
-
-
 class DoclingComponentType(str, Enum):
-    PDF_BACKEND = auto()
+    DOCUMENT_BACKEND = auto()
    MODEL = auto()
    DOC_ASSEMBLER = auto()

@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
    error_message: str


-class PageSize(BaseModel):
-    width: float = 0.0
-    height: float = 0.0
-
-
-class BoundingBox(BaseModel):
-    l: float  # left
-    t: float  # top
-    r: float  # right
-    b: float  # bottom
-
-    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
-
-    @property
-    def width(self):
-        return self.r - self.l
-
-    @property
-    def height(self):
-        return abs(self.t - self.b)
-
-    def scaled(self, scale: float) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l *= scale
-        out_bbox.r *= scale
-        out_bbox.t *= scale
-        out_bbox.b *= scale
-
-        return out_bbox
-
-    def normalized(self, page_size: PageSize) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l /= page_size.width
-        out_bbox.r /= page_size.width
-        out_bbox.t /= page_size.height
-        out_bbox.b /= page_size.height
-
-        return out_bbox
-
-    def as_tuple(self):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return (self.l, self.t, self.r, self.b)
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return (self.l, self.b, self.r, self.t)
-
-    @classmethod
-    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
-        if origin == CoordOrigin.TOPLEFT:
-            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b < t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-        elif origin == CoordOrigin.BOTTOMLEFT:
-            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b > t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-
-    def area(self) -> float:
-        area = (self.r - self.l) * (self.b - self.t)
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            area = -area
-        return area
-
-    def intersection_area_with(self, other: "BoundingBox") -> float:
-        # Calculate intersection coordinates
-        left = max(self.l, other.l)
-        top = max(self.t, other.t)
-        right = min(self.r, other.r)
-        bottom = min(self.b, other.b)
-
-        # Calculate intersection dimensions
-        width = right - left
-        height = bottom - top
-
-        # If the bounding boxes do not overlap, width or height will be negative
-        if width <= 0 or height <= 0:
-            return 0.0
-
-        return width * height
-
-    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.TOPLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,
-                b=page_height - self.b,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            )
-
-    def to_top_left_origin(self, page_height):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,  # self.b
-                b=page_height - self.b,  # self.t
-                coord_origin=CoordOrigin.TOPLEFT,
-            )
-
-
 class Cell(BaseModel):
    id: int
    text: str
@@ -169,14 +101,14 @@ class OcrCell(Cell):

 class Cluster(BaseModel):
    id: int
-    label: str
+    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
    cells: List[Cell] = []


 class BasePageElement(BaseModel):
-    label: str
+    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
    clusters: List[Cluster] = []


-class TableCell(BaseModel):
-    bbox: BoundingBox
-    row_span: int
-    col_span: int
-    start_row_offset_idx: int
-    end_row_offset_idx: int
-    start_col_offset_idx: int
-    end_col_offset_idx: int
-    text: str
-    column_header: bool = False
-    row_header: bool = False
-    row_section: bool = False
-
-    @model_validator(mode="before")
-    @classmethod
-    def from_dict_format(cls, data: Any) -> Any:
-        if isinstance(data, Dict):
-            text = data["bbox"].get("token", "")
-            if not len(text):
-                text_cells = data.pop("text_cell_bboxes", None)
-                if text_cells:
-                    for el in text_cells:
-                        text += el["token"] + " "
-
-                text = text.strip()
-            data["text"] = text
-
-        return data
-
-
-class TableElement(BasePageElement):
+class Table(BasePageElement):
    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):


 class TableStructurePrediction(BaseModel):
-    table_map: Dict[int, TableElement] = {}
+    table_map: Dict[int, Table] = {}


-class TextElement(BasePageElement): ...
-
-
-class FigureData(BaseModel):
-    pass
+class TextElement(BasePageElement):
+    text: str


 class FigureElement(BasePageElement):
-    data: Optional[FigureData] = None
+    annotations: List[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
    equations_prediction: Optional[EquationPrediction] = None


-PageElement = Union[TextElement, TableElement, FigureElement]
+PageElement = Union[TextElement, Table, FigureElement]


 class AssembledUnit(BaseModel):
@@ -272,13 +171,13 @@ class Page(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
-    page_hash: Optional[str] = None
-    size: Optional[PageSize] = None
+    # page_hash: Optional[str] = None
+    size: Optional[Size] = None
    cells: List[Cell] = []
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

-    _backend: Optional[PdfPageBackend] = (
+    _backend: Optional["PdfPageBackend"] = (
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
@@ -301,24 +200,5 @@ class Page(BaseModel):
 class DocumentStream(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

-    filename: str
+    name: str
    stream: BytesIO
-
-
-class AssembleOptions(BaseModel):
-    keep_page_images: Annotated[
-        bool,
-        Field(
-            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
-        ),
-    ] = False  # False: page images are removed in the assemble step
-    images_scale: Optional[float] = None  # if set, the scale for generated images
-
-    @model_validator(mode="after")
-    def set_page_images_from_deprecated(self) -> Self:
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", DeprecationWarning)
-            default_scale = 1.0
-            if self.keep_page_images and self.images_scale is None:
-                self.images_scale = default_scale
-        return self