Documentation updates, remove DescriptionItem in DoclingDocument init

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-16 16:05:49 +02:00 · 2024-10-16 16:05:49 +02:00 · 734d77c8ae
commit 734d77c8ae
parent 07206c5b3e
31 changed files with 154 additions and 100 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -5,12 +5,12 @@ from typing import Set, Union

 from bs4 import BeautifulSoup
 from docling_core.types.doc import (
-    DescriptionItem,
+    DocItemLabel,
    DoclingDocument,
+    GroupLabel,
    TableCell,
    TableData,
 )
-from docling_core.types.doc.labels import DocItemLabel, GroupLabel

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

    def convert(self) -> DoclingDocument:
        # access self.path_or_stream to load stuff
-        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
+        doc = DoclingDocument(name="dummy")
        _log.debug("Trying to convert HTML...")

        if self.is_valid():
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -4,16 +4,17 @@ from pathlib import Path
 from typing import Set, Union

 from docling_core.types.doc import (
-    DescriptionItem,
+    BoundingBox,
+    CoordOrigin,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    GroupLabel,
    ProvenanceItem,
+    Size,
    TableCell,
    TableData,
 )
-from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER

@ -96,7 +97,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        else:
            docname = "stream"
        doc = DoclingDocument(
-            description=DescriptionItem(), name=docname, origin=origin
+            name=docname, origin=origin
        )  # must add origin information
        doc = self.walk_linear(self.pptx_obj, doc)

--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -5,7 +5,6 @@ from typing import Set, Union

 import docx
 from docling_core.types.doc import (
-    DescriptionItem,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
@ -99,9 +98,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            docname = Path(fname).stem
        else:
            docname = "stream"
-        doc = DoclingDocument(
-            description=DescriptionItem(), name=docname, origin=origin
-        )
+        doc = DoclingDocument(name=docname, origin=origin)
        if self.is_valid():
            assert self.docx_obj is not None
            doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -2,9 +2,13 @@ from enum import Enum, auto
 from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union

-from docling_core.types.doc import BoundingBox, Size
-from docling_core.types.doc.document import PictureDataType, TableCell
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItemLabel,
+    PictureDataType,
+    Size,
+    TableCell,
+)
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict

--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -13,7 +13,6 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types.doc import (
-    DescriptionItem,
    DocItem,
    DocItemLabel,
    DoclingDocument,
@ -37,11 +36,9 @@ from docling.datamodel.base_models import (
    ConversionStatus,
    DocumentStream,
    ErrorItem,
-    FigureElement,
    InputFormat,
    MimeTypeToFormat,
    Page,
-    PageElement,
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.utils import create_file_hash, create_hash
@ -70,9 +67,7 @@ layout_label_to_ds_type = {
    DocItemLabel.PARAGRAPH: "paragraph",
 }

-_EMPTY_DOCLING_DOC = DoclingDocument(
-    description=DescriptionItem(), name="dummy"
-)  # TODO: Stub
+_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")


 class InputDocument(BaseModel):
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@ -11,8 +11,7 @@ from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
-from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.document import DoclingDocument
+from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.legacy_doc.base import Figure, TableCell
 from PIL import ImageDraw
@ -222,15 +221,8 @@ class GlmModel:
        ds_doc_dict = ds_doc.model_dump(by_alias=True)

        glm_doc = self.model.apply_on_doc(ds_doc_dict)
-        # ds_doc_dict = to_legacy_document_format(
-        #    glm_doc, ds_doc_dict, update_name_label=True
-        # )

        docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
-        # legacy_doc: DsLegacyDocument = None
-
-        # if self.create_legacy_output:
-        #    legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)

        # DEBUG code:
        def draw_clusters_and_cells(ds_document, page_no):
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -5,8 +5,7 @@ import time
 from pathlib import Path
 from typing import Iterable, List

-from docling_core.types.doc import CoordOrigin
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import CoordOrigin, DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import ImageDraw

--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -3,9 +3,7 @@ from pathlib import Path
 from typing import Iterable, List

 import numpy
-from docling_core.types.doc import BoundingBox
-from docling_core.types.doc.document import TableCell
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw

--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -2,7 +2,7 @@ import logging
 from pathlib import Path
 from typing import Optional

-from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
--- a/docling/utils/layout_utils.py
+++ b/docling/utils/layout_utils.py
@ -2,7 +2,7 @@ import copy
 import logging

 import networkx as nx
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import DocItemLabel

 logger = logging.getLogger("layout_utils")

--- a/docs/assets/docling_doc_hierarchy_1.png
+++ b/docs/assets/docling_doc_hierarchy_1.png
--- a/docs/assets/docling_doc_hierarchy_2.png
+++ b/docs/assets/docling_doc_hierarchy_2.png
--- a/docs/concepts/docling_format.md
+++ b/docs/concepts/docling_format.md
@ -0,0 +1,64 @@
+With Docling v2, we introduce a unified document representation format called `DoclingDocument`. It is defined as a 
+pydantic datatype, which can express several features common to documents, such as:
+* Text, Tables, Pictures, and more
+* Document hierarchy with sections and groups
+* Disambiguation between main body and headers, footers (furniture)
+* Layout information (i.e. bounding boxes) for all items, if available
+* Provenance information
+
+It also brings a set of document construction APIs to build up a `DoclingDocument` from scratch.
+
+# Example document structures
+
+To illustrate the features of the `DoclingDocument` format, consider the following side-by-side comparison of a
+`DoclingDocument` converted from `tests/data/word_sample.docx`. The left side shows snippets from the converted document 
+serialized as YAML, the right side shows the corresponding visual parts in MS Word.
+
+## Basic structure
+
+A `DoclingDocument` exposes top-level fields for the document content, organized in two categories. 
+The first category is the _content items_, which are stored in these fields:
+
+- `texts`: All items that have a text representation (paragraph, section heading, equation, ...). Base class is `TextItem`.
+- `tables`: All tables, type `TableItem`. Can carry structure annotations.
+- `pictures`: All pictures, type `PictureItem`. Can carry structure annotations.
+- `key_value_items`: All key-value items.
+
+All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different
+data structures depending on their type, and reference parents and children through JSON pointers.
+
+The second category is _content structure_, which is encapsulated in:
+
+- `body`: The root node of a tree-structure for the main document body
+- `furniture`: The root node of a tree-structure for all items that don't belong in the body (headers, footers, ...)
+- `groups`: A set of items that don't represent content, but act as containers for other content items (e.g. a list, a chapter)
+
+All of the above fields are only storing `NodeItem` instances, which reference children and parents 
+through JSON pointers. 
+
+The reading order of the document is encapsulated through the `body` tree and the order of _children_ in each item
+in the tree.
+
+The example below shows how all items in the first page are nested below the `title` item (`#/texts/1`). 
+
+![doc_hierarchy_1](../assets/docling_doc_hierarchy_1.png)
+
+## Grouping
+
+The example below shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as children. The children of
+"Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the 
+top-level `groups` field.
+
+![doc_hierarchy_2](../assets/docling_doc_hierarchy_2.png)
+
+## Tables
+
+TBD
+
+## Pictures
+
+TBD
+
+## Provenance
+
+TBD
--- a/docs/examples/develop_picture_enrichment.py
+++ b/docs/examples/develop_picture_enrichment.py
@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any, Iterable

 from docling_core.types.doc import DoclingDocument, NodeItem
-from docling_core.types.doc.document import PictureClassificationData, PictureItem, PictureClassificationClass
+from docling_core.types.doc import PictureClassificationData, PictureItem, PictureClassificationClass

 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@ -2,7 +2,7 @@ import logging
 import time
 from pathlib import Path

-from docling_core.types.doc.document import PictureItem, TableItem
+from docling_core.types.doc import PictureItem, TableItem

 from docling.datamodel.base_models import FigureElement, InputFormat, Table
 from docling.datamodel.pipeline_options import PdfPipelineOptions
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@ -16,23 +16,22 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

 _log = logging.getLogger(__name__)

-USE_EXPERIMENTAL = False
-
-input_paths = [
+def main():
+    input_paths = [
        Path("tests/data/wiki_duck.html"),
        Path("tests/data/word_sample.docx"),
        Path("tests/data/lorem_ipsum.docx"),
        Path("tests/data/powerpoint_sample.pptx"),
        Path("tests/data/2305.03393v1-pg9-img.png"),
        Path("tests/data/2206.01062.pdf"),
-]
+    ]

-## for defaults use:
-# doc_converter = DocumentConverter()
+    ## for defaults use:
+    # doc_converter = DocumentConverter()

-## to customize use:
+    ## to customize use:

-doc_converter = (
+    doc_converter = (
        DocumentConverter(  # all of the below is optional, has internal defaults.
            allowed_formats=[
                InputFormat.PDF,
@ -50,11 +49,11 @@ doc_converter = (
                ),
            },
        )
-)
+    )

-conv_results = doc_converter.convert_all(input_paths)
+    conv_results = doc_converter.convert_all(input_paths)

-for res in conv_results:
+    for res in conv_results:
        out_path = Path("scratch")
        print(
            f"Document {res.input.file.name} converted."
@ -70,3 +69,5 @@ for res in conv_results:

        with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
            fp.write(yaml.safe_dump(res.document.export_to_dict()))
+if __name__ == "__main__":
+    main()
--- a/docs/v2.md
+++ b/docs/v2.md
@ -92,8 +92,9 @@ doc_converter = (
 **Note**: If you work only with defaults, all remains the same as in Docling v1.

 More options are shown in the following example units:
- [run_with_formats.py](docs/examples/run_with_formats.py)
- [custom_convert.py](docs/examples/custom_convert.py)
+
+- [run_with_formats.py](../examples/run_with_formats/)
+- [custom_convert.py](../examples/custom_convert/)

 ### Converting documents

--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -55,15 +55,17 @@ nav:
    - Home: index.md
    - Installation: installation.md
    - Docling v2: v2.md
-  # - Concepts:
-  #   - Docling Document: concepts/document.md
+  - Concepts:
+    - The Docling Document format: concepts/docling_format.md
  #   - Chunking: concepts/chunking.md
  - Examples:
    - Conversion:
      - "Simple conversion": examples/minimal.py
      - "Custom conversion": examples/custom_convert.py
      - "Batch conversion": examples/batch_convert.py
+      - "Multi-format conversion": examples/run_with_formats.py
      - "Figure export": examples/export_figures.py
+      - "Figure enrichment": examples/develop_picture_enrichment.py
      - "Table export": examples/export_tables.py
      - "Multimodal export": examples/export_multimodal.py
    - RAG / QA:
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v2/redp5110.json
+++ b/tests/data/groundtruth/docling_v2/redp5110.json
--- a/tests/data/groundtruth/docling_v2/redp5110.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5110.pages.json
--- a/tests/data/groundtruth/docling_v2/redp5695.json
+++ b/tests/data/groundtruth/docling_v2/redp5695.json
--- a/tests/data/groundtruth/docling_v2/redp5695.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5695.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
@ -1 +1 @@
-{"schema_name": "DoclingDocument", "version": "1.0.0", "description": {}, "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
+{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
--- a/tests/test_backend_pdfium.py
+++ b/tests/test_backend_pdfium.py
@ -1,7 +1,7 @@
 from pathlib import Path

 import pytest
-from docling_core.types.doc.base import BoundingBox
+from docling_core.types.doc import BoundingBox

 from docling.backend.pypdfium2_backend import (
    PyPdfiumDocumentBackend,