Merge branch 'cau/experimental-format' of github.com:DS4SD/docling into cau/input-format-abstraction

2025-12-16 08:38:14 +00:00 · 2024-09-30 13:47:57 +02:00
parent 95c539579d 0a86529afb
commit cd06d89c2a
30 changed files with 979 additions and 784 deletions
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -3,7 +3,7 @@ from io import BytesIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Iterable, Optional, Union

-from docling_core.types.experimental.base import BoundingBox, Size
+from docling_core.types.experimental import BoundingBox, Size
 from PIL import Image

 if TYPE_CHECKING:
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import Iterable, List, Optional, Union

 import pypdfium2 as pdfium
-from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
+from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
 from docling_parse.docling_parse import pdf_parser
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
-from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
+from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage, PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -9,7 +9,6 @@ from typing import Annotated, Iterable, List, Optional

 import typer
 from docling_core.utils.file import resolve_file_source
-from pydantic import AnyUrl

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -181,58 +180,25 @@ def convert(
        else:
            input_doc_paths.append(source)

-    ###########################################################################
+    match backend:
+        case Backend.PYPDFIUM2:
+            do_cell_matching = ocr  # only do cell matching when OCR enabled
+            pdf_backend = PyPdfiumDocumentBackend
+        case Backend.DOCLING:
+            do_cell_matching = True
+            pdf_backend = DoclingParseDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected backend type {backend}")

-    # The following sections contain a combination of PipelineOptions
-    # and PDF Backends for various configurations.
-    # Uncomment one section at the time to see the differences in the output.
-
-    doc_converter = None
-    if backend == Backend.PYPDFIUM2 and not ocr:  # PyPdfium without OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = False
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=PyPdfiumDocumentBackend,
-        )
-
-    elif backend == Backend.PYPDFIUM2.value and ocr:  # PyPdfium with OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=PyPdfiumDocumentBackend,
-        )
-
-    elif backend == Backend.DOCLING.value and not ocr:  # Docling Parse without OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=DoclingParseDocumentBackend,
-        )
-
-    elif backend == Backend.DOCLING.value and ocr:  # Docling Parse with OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = True
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=DoclingParseDocumentBackend,
-        )
-
-    ###########################################################################
+    pipeline_options = PipelineOptions(
+        do_ocr=ocr,
+        do_table_structure=True,
+    )
+    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=pdf_backend,
+    )

    # Define input files
    input = DocumentConversionInput.from_paths(input_doc_paths)
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -4,9 +4,9 @@ from enum import Enum, auto
 from io import BytesIO
 from typing import Annotated, Any, Dict, List, Optional, Tuple, Union

-from docling_core.types.experimental.base import BoundingBox, Size
-from docling_core.types.experimental.document import BaseFigureData, TableCell
-from docling_core.types.experimental.labels import PageLabel
+from docling_core.types.experimental import BoundingBox, Size
+from docling_core.types.experimental.document import BasePictureData, TableCell
+from docling_core.types.experimental.labels import DocItemLabel
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 from typing_extensions import Self
@@ -59,14 +59,14 @@ class OcrCell(Cell):

 class Cluster(BaseModel):
    id: int
-    label: PageLabel
+    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
    cells: List[Cell] = []


 class BasePageElement(BaseModel):
-    label: PageLabel
+    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
@@ -92,7 +92,7 @@ class TextElement(BasePageElement): ...


 class FigureElement(BasePageElement):
-    data: Optional[BaseFigureData] = None
+    data: Optional[BasePictureData] = None
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -11,8 +11,11 @@ from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types.doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.doc.base import Figure, TableCell
-from docling_core.types.experimental.document import DoclingDocument, FileInfo
-from docling_core.types.experimental.labels import PageLabel
+from docling_core.types.experimental import (
+    DescriptionItem,
+    DocItemLabel,
+    DoclingDocument,
+)
 from pydantic import BaseModel
 from typing_extensions import deprecated

@@ -36,21 +39,21 @@ from docling.utils.utils import create_file_hash
 _log = logging.getLogger(__name__)

 layout_label_to_ds_type = {
-    PageLabel.TITLE: "title",
-    PageLabel.DOCUMENT_INDEX: "table-of-contents",
-    PageLabel.SECTION_HEADER: "subtitle-level-1",
-    PageLabel.CHECKBOX_SELECTED: "checkbox-selected",
-    PageLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
-    PageLabel.CAPTION: "caption",
-    PageLabel.PAGE_HEADER: "page-header",
-    PageLabel.PAGE_FOOTER: "page-footer",
-    PageLabel.FOOTNOTE: "footnote",
-    PageLabel.TABLE: "table",
-    PageLabel.FORMULA: "equation",
-    PageLabel.LIST_ITEM: "paragraph",
-    PageLabel.CODE: "paragraph",
-    PageLabel.PICTURE: "figure",
-    PageLabel.TEXT: "paragraph",
+    DocItemLabel.TITLE: "title",
+    DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
+    DocItemLabel.SECTION_HEADER: "subtitle-level-1",
+    DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
+    DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
+    DocItemLabel.CAPTION: "caption",
+    DocItemLabel.PAGE_HEADER: "page-header",
+    DocItemLabel.PAGE_FOOTER: "page-footer",
+    DocItemLabel.FOOTNOTE: "footnote",
+    DocItemLabel.TABLE: "table",
+    DocItemLabel.FORMULA: "equation",
+    DocItemLabel.LIST_ITEM: "paragraph",
+    DocItemLabel.CODE: "paragraph",
+    DocItemLabel.PICTURE: "figure",
+    DocItemLabel.TEXT: "paragraph",
 }

 _EMPTY_DOC = DsDocument(
@@ -63,7 +66,7 @@ _EMPTY_DOC = DsDocument(
 )

 _EMPTY_DOCLING_DOC = DoclingDocument(
-    description={}, file_info=FileInfo(document_hash="123xyz")
+    description=DescriptionItem(), name="dummy"
 )  # TODO: Stub

 _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
@@ -341,8 +344,10 @@ class ConvertedDocument(BaseModel):
            "paragraph",
            "caption",
            "table",
+            "figure",
        ],
        strict_text: bool = False,
+        image_placeholder: str = "<!-- image -->",
    ):
        return self.output.export_to_markdown(
            delim=delim,
@@ -350,6 +355,7 @@ class ConvertedDocument(BaseModel):
            main_text_stop=main_text_stop,
            main_text_labels=main_text_labels,
            strict_text=strict_text,
+            image_placeholder=image_placeholder,
        )

    def render_as_text(
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple

 import numpy
 import numpy as np
-from docling_core.types.experimental.base import BoundingBox, CoordOrigin
+from docling_core.types.experimental import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@@ -11,7 +11,7 @@ from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_mode
 from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
 from docling_core.types import Ref
-from docling_core.types.experimental.base import BoundingBox, CoordOrigin
+from docling_core.types.experimental import BoundingBox, CoordOrigin
 from docling_core.types.experimental.document import DoclingDocument
 from PIL import ImageDraw

@@ -57,7 +57,7 @@ class GlmModel:
                    if arr == "tables":
                        prov = ds_document.tables[index].prov[0]
                    elif arr == "figures":
-                        prov = ds_document.figures[index].prov[0]
+                        prov = ds_document.pictures[index].prov[0]
                    else:
                        prov = None

--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -2,7 +2,7 @@ import logging
 from typing import Iterable

 import numpy
-from docling_core.types.experimental.base import BoundingBox, CoordOrigin
+from docling_core.types.experimental import BoundingBox, CoordOrigin

 from docling.datamodel.base_models import OcrCell, Page
 from docling.models.base_ocr_model import BaseOcrModel
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -4,8 +4,8 @@ import random
 import time
 from typing import Iterable, List

-from docling_core.types.experimental.base import CoordOrigin
-from docling_core.types.experimental.labels import PageLabel
+from docling_core.types.experimental import CoordOrigin
+from docling_core.types.experimental.labels import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import ImageDraw

@@ -24,23 +24,23 @@ _log = logging.getLogger(__name__)
 class LayoutModel:

    TEXT_ELEM_LABELS = [
-        PageLabel.TEXT,
-        PageLabel.FOOTNOTE,
-        PageLabel.CAPTION,
-        PageLabel.CHECKBOX_UNSELECTED,
-        PageLabel.CHECKBOX_SELECTED,
-        PageLabel.SECTION_HEADER,
-        PageLabel.PAGE_HEADER,
-        PageLabel.PAGE_FOOTER,
-        PageLabel.CODE,
-        PageLabel.LIST_ITEM,
+        DocItemLabel.TEXT,
+        DocItemLabel.FOOTNOTE,
+        DocItemLabel.CAPTION,
+        DocItemLabel.CHECKBOX_UNSELECTED,
+        DocItemLabel.CHECKBOX_SELECTED,
+        DocItemLabel.SECTION_HEADER,
+        DocItemLabel.PAGE_HEADER,
+        DocItemLabel.PAGE_FOOTER,
+        DocItemLabel.CODE,
+        DocItemLabel.LIST_ITEM,
        # "Formula",
    ]
-    PAGE_HEADER_LABELS = [PageLabel.PAGE_HEADER, PageLabel.PAGE_FOOTER]
+    PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]

-    TABLE_LABEL = PageLabel.TABLE
-    FIGURE_LABEL = PageLabel.PICTURE
-    FORMULA_LABEL = PageLabel.FORMULA
+    TABLE_LABEL = DocItemLabel.TABLE
+    FIGURE_LABEL = DocItemLabel.PICTURE
+    FORMULA_LABEL = DocItemLabel.FORMULA

    def __init__(self, config):
        self.config = config
@@ -51,27 +51,27 @@ class LayoutModel:
    def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
        MIN_INTERSECTION = 0.2
        CLASS_THRESHOLDS = {
-            PageLabel.CAPTION: 0.35,
-            PageLabel.FOOTNOTE: 0.35,
-            PageLabel.FORMULA: 0.35,
-            PageLabel.LIST_ITEM: 0.35,
-            PageLabel.PAGE_FOOTER: 0.35,
-            PageLabel.PAGE_HEADER: 0.35,
-            PageLabel.PICTURE: 0.2,  # low threshold adjust to capture chemical structures for examples.
-            PageLabel.SECTION_HEADER: 0.45,
-            PageLabel.TABLE: 0.35,
-            PageLabel.TEXT: 0.45,
-            PageLabel.TITLE: 0.45,
-            PageLabel.DOCUMENT_INDEX: 0.45,
-            PageLabel.CODE: 0.45,
-            PageLabel.CHECKBOX_SELECTED: 0.45,
-            PageLabel.CHECKBOX_UNSELECTED: 0.45,
-            PageLabel.FORM: 0.45,
-            PageLabel.KEY_VALUE_REGION: 0.45,
+            DocItemLabel.CAPTION: 0.35,
+            DocItemLabel.FOOTNOTE: 0.35,
+            DocItemLabel.FORMULA: 0.35,
+            DocItemLabel.LIST_ITEM: 0.35,
+            DocItemLabel.PAGE_FOOTER: 0.35,
+            DocItemLabel.PAGE_HEADER: 0.35,
+            DocItemLabel.PICTURE: 0.2,  # low threshold, adjusted to capture e.g. chemical structures.
+            DocItemLabel.SECTION_HEADER: 0.45,
+            DocItemLabel.TABLE: 0.35,
+            DocItemLabel.TEXT: 0.45,
+            DocItemLabel.TITLE: 0.45,
+            DocItemLabel.DOCUMENT_INDEX: 0.45,
+            DocItemLabel.CODE: 0.45,
+            DocItemLabel.CHECKBOX_SELECTED: 0.45,
+            DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
+            DocItemLabel.FORM: 0.45,
+            DocItemLabel.KEY_VALUE_REGION: 0.45,
        }

        CLASS_REMAPPINGS = {
-            PageLabel.DOCUMENT_INDEX: PageLabel.TABLE,
+            DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
        }

        _log.debug("================= Start postprocess function ====================")
@@ -258,7 +258,7 @@ class LayoutModel:
                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ).to_top_left_origin(page_height),
                confidence=c["confidence"],
-                label=PageLabel(c["type"]),
+                label=DocItemLabel(c["type"]),
                cells=cluster_cells,
            )
            clusters_out_new.append(c_new)
@@ -271,7 +271,7 @@ class LayoutModel:
            for ix, pred_item in enumerate(
                self.layout_predictor.predict(page.get_image(scale=1.0))
            ):
-                label = PageLabel(
+                label = DocItemLabel(
                    pred_item["label"].lower().replace(" ", "_").replace("-", "_")
                )  # Temporary, until docling-ibm-model uses docling-core types
                cluster = Cluster(
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -2,8 +2,9 @@ import copy
 from typing import Iterable, List

 import numpy
-from docling_core.types.experimental.base import BoundingBox
+from docling_core.types.experimental import BoundingBox
 from docling_core.types.experimental.document import TableCell
+from docling_core.types.experimental.labels import DocItemLabel
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw

@@ -74,7 +75,7 @@ class TableStructureModel:
                    ],
                )
                for cluster in page.predictions.layout.clusters
-                if cluster.label == "Table"
+                if cluster.label == DocItemLabel.TABLE
            ]
            if not len(in_tables):
                yield page
@@ -138,7 +139,7 @@ class TableStructureModel:
                        id=table_cluster.id,
                        page_no=page.page_no,
                        cluster=table_cluster,
-                        label="Table",
+                        label=DocItemLabel.TABLE,
                    )

                    page.predictions.tablestructure.table_map[table_cluster.id] = tbl
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -2,7 +2,7 @@ import logging
 from typing import Any, Dict, Iterable, List, Tuple, Union

 from docling_core.types.doc.base import BaseCell, BaseText, BoundingBox, Ref, Table
-from docling_core.types.experimental.base import CoordOrigin
+from docling_core.types.experimental import CoordOrigin

 from docling.datamodel.base_models import OcrCell
 from docling.datamodel.document import ConversionResult, Page
--- a/docling/utils/layout_utils.py
+++ b/docling/utils/layout_utils.py
@@ -2,7 +2,7 @@ import copy
 import logging

 import networkx as nx
-from docling_core.types.experimental.labels import PageLabel
+from docling_core.types.experimental.labels import DocItemLabel

 logger = logging.getLogger("layout_utils")

@@ -371,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
            "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
        )
        logger.debug("  with cells: " + str(new_cluster["cell_ids"]))
-        if len(cluster["cell_ids"]) == 0 and cluster["type"] != PageLabel.PICTURE:
+        if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
            logger.debug("  Empty non-picture, removed")
            continue  ## Skip this former cluster, now without cells.
        new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -381,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):


 def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
-    if not (cluster["type"] in [PageLabel.TABLE, PageLabel.PICTURE]):
+    if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
        ## A text-like cluster. The bbox only needs to be around the text cells:
        logger.debug("    Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug("  New bounding box:" + str(new_bbox))
-    if cluster["type"] == PageLabel.PICTURE:
+    if cluster["type"] == DocItemLabel.PICTURE:
        ## We only make the bbox completely comprise included text cells:
        logger.debug("  Picture")
        if len(cluster["cell_ids"]) != 0:
@@ -588,7 +588,7 @@ def set_orphan_as_text(
    max_id = -1
    figures = []
    for cluster in cluster_predictions:
-        if cluster["type"] == PageLabel.PICTURE:
+        if cluster["type"] == DocItemLabel.PICTURE:
            figures.append(cluster)

        if cluster["id"] > max_id:
@@ -639,13 +639,13 @@ def set_orphan_as_text(
            # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
            if fig_flag == False and lines_detector == False:
                # get class from low confidence detections if not set as text:
-                class_type = PageLabel.TEXT
+                class_type = DocItemLabel.TEXT

                for cluster in cluster_predictions_low:
                    intersection = compute_intersection(
                        orph_cell["bbox"], cluster["bbox"]
                    )
-                    class_type = PageLabel.TEXT
+                    class_type = DocItemLabel.TEXT
                    if (
                        cluster["confidence"] > 0.1
                        and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -719,7 +719,9 @@ def merge_cells(cluster_predictions):
                    if cluster["id"] == node:
                        lines.append(cluster)
                        cluster_predictions.remove(cluster)
-            new_merged_cluster = build_cluster_from_lines(lines, PageLabel.TEXT, max_id)
+            new_merged_cluster = build_cluster_from_lines(
+                lines, DocItemLabel.TEXT, max_id
+            )
            cluster_predictions.append(new_merged_cluster)
    return cluster_predictions

@@ -754,9 +756,9 @@ def clean_up_clusters(
                # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
                elif img_table == True:
                    if (
-                        cluster_1["type"] == PageLabel.TEXT
-                        and cluster_2["type"] == PageLabel.PICTURE
-                        or cluster_2["type"] == PageLabel.TABLE
+                        cluster_1["type"] == DocItemLabel.TEXT
+                        and cluster_2["type"] == DocItemLabel.PICTURE
+                        or cluster_2["type"] == DocItemLabel.TABLE
                    ):
                        if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -773,7 +775,7 @@ def clean_up_clusters(
            # remove tables that have one pdf cell
            if one_cell_table == True:
                if (
-                    cluster_1["type"] == PageLabel.TABLE
+                    cluster_1["type"] == DocItemLabel.TABLE
                    and len(cluster_1["cell_ids"]) < 2
                ):
                    DuplicateDeletedClusterIDs.append(cluster_1["id"])