Documentation updates, remove DescriptionItem in DoclingDocument init

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-16 16:05:49 +02:00 · 2024-10-16 16:05:49 +02:00 · 734d77c8ae
commit 734d77c8ae
parent 07206c5b3e
31 changed files with 154 additions and 100 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -5,12 +5,12 @@ from typing import Set, Union
 from bs4 import BeautifulSoup
 from docling_core.types.doc import (
-    DescriptionItem,
+    DocItemLabel,
    DoclingDocument,
    GroupLabel,
    TableCell,
    TableData,
 )
 from docling_core.types.doc.labels import DocItemLabel, GroupLabel
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def convert(self) -> DoclingDocument:
        # access self.path_or_stream to load stuff
-        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
+        doc = DoclingDocument(name="dummy")
        _log.debug("Trying to convert HTML...")
        if self.is_valid():
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -4,16 +4,17 @@ from pathlib import Path
 from typing import Set, Union
 from docling_core.types.doc import (
-    DescriptionItem,
+    BoundingBox,
    CoordOrigin,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    GroupLabel,
    ProvenanceItem,
    Size,
    TableCell,
    TableData,
 )
 from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@ -96,7 +97,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        else:
            docname = "stream"
        doc = DoclingDocument(
-            description=DescriptionItem(), name=docname, origin=origin
+            name=docname, origin=origin
        )  # must add origin information
        doc = self.walk_linear(self.pptx_obj, doc)
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -5,7 +5,6 @@ from typing import Set, Union
 import docx
 from docling_core.types.doc import (
    DescriptionItem,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
@ -99,9 +98,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            docname = Path(fname).stem
        else:
            docname = "stream"
-        doc = DoclingDocument(
+        doc = DoclingDocument(name=docname, origin=origin)
            description=DescriptionItem(), name=docname, origin=origin
        )
        if self.is_valid():
            assert self.docx_obj is not None
            doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -2,9 +2,13 @@ from enum import Enum, auto
 from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
-from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc import (
-from docling_core.types.doc.document import PictureDataType, TableCell
+    BoundingBox,
-from docling_core.types.doc.labels import DocItemLabel
+    DocItemLabel,
    PictureDataType,
    Size,
    TableCell,
 )
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -13,7 +13,6 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types.doc import (
    DescriptionItem,
    DocItem,
    DocItemLabel,
    DoclingDocument,
@ -37,11 +36,9 @@ from docling.datamodel.base_models import (
    ConversionStatus,
    DocumentStream,
    ErrorItem,
    FigureElement,
    InputFormat,
    MimeTypeToFormat,
    Page,
    PageElement,
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.utils import create_file_hash, create_hash
@ -70,9 +67,7 @@ layout_label_to_ds_type = {
    DocItemLabel.PARAGRAPH: "paragraph",
 }
-_EMPTY_DOCLING_DOC = DoclingDocument(
+_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
    description=DescriptionItem(), name="dummy"
 )  # TODO: Stub
 class InputDocument(BaseModel):
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@ -11,8 +11,7 @@ from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
-from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.doc.document import DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.legacy_doc.base import Figure, TableCell
 from PIL import ImageDraw
@ -222,15 +221,8 @@ class GlmModel:
        ds_doc_dict = ds_doc.model_dump(by_alias=True)
        glm_doc = self.model.apply_on_doc(ds_doc_dict)
        # ds_doc_dict = to_legacy_document_format(
        #    glm_doc, ds_doc_dict, update_name_label=True
        # )
        docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
        # legacy_doc: DsLegacyDocument = None
        # if self.create_legacy_output:
        #    legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
        # DEBUG code:
        def draw_clusters_and_cells(ds_document, page_no):
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -5,8 +5,7 @@ import time
 from pathlib import Path
 from typing import Iterable, List
-from docling_core.types.doc import CoordOrigin
+from docling_core.types.doc import CoordOrigin, DocItemLabel
 from docling_core.types.doc.labels import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import ImageDraw
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -3,9 +3,7 @@ from pathlib import Path
 from typing import Iterable, List
 import numpy
-from docling_core.types.doc import BoundingBox
+from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_core.types.doc.document import TableCell
 from docling_core.types.doc.labels import DocItemLabel
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -2,7 +2,7 @@ import logging
 from pathlib import Path
 from typing import Optional
-from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
--- a/docling/utils/layout_utils.py
+++ b/docling/utils/layout_utils.py
@ -2,7 +2,7 @@ import copy
 import logging
 import networkx as nx
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import DocItemLabel
 logger = logging.getLogger("layout_utils")
--- a/docs/assets/docling_doc_hierarchy_1.png
+++ b/docs/assets/docling_doc_hierarchy_1.png
--- a/docs/assets/docling_doc_hierarchy_2.png
+++ b/docs/assets/docling_doc_hierarchy_2.png
--- a/docs/concepts/docling_format.md
+++ b/docs/concepts/docling_format.md
@ -0,0 +1,64 @@
 With Docling v2, we introduce a unified document representation format called `DoclingDocument`. It is defined as a 
 pydantic datatype, which can express several features common to documents, such as:
 * Text, Tables, Pictures, and more
 * Document hierarchy with sections and groups
 * Disambiguation between main body and headers, footers (furniture)
 * Layout information (i.e. bounding boxes) for all items, if available
 * Provenance information
 It also brings a set of document construction APIs to build up a `DoclingDocument` from scratch.
 # Example document structures
 To illustrate the features of the `DoclingDocument` format, consider the following side-by-side comparison of a
 `DoclingDocument` converted from `tests/data/word_sample.docx`. The left side shows snippets from the converted document 
 serialized as YAML, right side shows the corresponding visual parts in MS Word.
 ## Basic structure
 A `DoclingDocument` exposes top-level fields for the document content, organized in two categories. 
 The first category is the _content items_, which are stored in these fields:
 - `texts`: All items that have a text representation (paragraph, section heading, equation, ...). Base class is `TextItem`.
 - `tables`: All tables, type `TableItem`. Can carry structure annotations.
 - `pictures`: All pictures, type `PictureItem`. Can carry structure annotations.
 - `key_value_items`: All key-value items.
 All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different
 data structures depending on their type, and reference parents and children through JSON pointers.
 The second category is _content structure_, which is encapsulated in:
 - `body`: The root node of a tree-structure for the main document body
 - `furniture`: The root node of a tree-structure for all items that don't belong in the body (headers, footers, ...)
 - `groups`: A set of items that don't represent content, but act as containers for other content items (e.g. a list, a chapter)
 All of the above fields are only storing `NodeItem` instances, which reference children and parents 
 through JSON pointers. 
 The reading order of the document is encapsulated through the `body` tree and the order of _children_ in each item
 in the tree.
 The example below shows how all items in the first page are nested below the `title` item (`#/texts/1`). 
 ![doc_hierarchy_1](../assets/docling_doc_hierarchy_1.png)
 ## Grouping
 The example below shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as children. The children of
 "Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the 
 top-level `groups` field.
 ![doc_hierarchy_2](../assets/docling_doc_hierarchy_2.png)
 ## Tables
 TBD
 ## Pictures
 TBD
 ## Provenance
 TBD
--- a/docs/examples/develop_picture_enrichment.py
+++ b/docs/examples/develop_picture_enrichment.py
@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any, Iterable
 from docling_core.types.doc import DoclingDocument, NodeItem
-from docling_core.types.doc.document import PictureClassificationData, PictureItem, PictureClassificationClass
+from docling_core.types.doc import PictureClassificationData, PictureItem, PictureClassificationClass
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@ -2,7 +2,7 @@ import logging
 import time
 from pathlib import Path
-from docling_core.types.doc.document import PictureItem, TableItem
+from docling_core.types.doc import PictureItem, TableItem
 from docling.datamodel.base_models import FigureElement, InputFormat, Table
 from docling.datamodel.pipeline_options import PdfPipelineOptions
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@ -16,57 +16,58 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 _log = logging.getLogger(__name__)
-USE_EXPERIMENTAL = False
+def main():
    input_paths = [
        Path("tests/data/wiki_duck.html"),
        Path("tests/data/word_sample.docx"),
        Path("tests/data/lorem_ipsum.docx"),
        Path("tests/data/powerpoint_sample.pptx"),
        Path("tests/data/2305.03393v1-pg9-img.png"),
        Path("tests/data/2206.01062.pdf"),
    ]
-input_paths = [
+    ## for defaults use:
-    Path("tests/data/wiki_duck.html"),
+    # doc_converter = DocumentConverter()
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2305.03393v1-pg9-img.png"),
    Path("tests/data/2206.01062.pdf"),
 ]
-## for defaults use:
+    ## to customize use:
 # doc_converter = DocumentConverter()
-## to customize use:
+    doc_converter = (
-
+        DocumentConverter(  # all of the below is optional, has internal defaults.
-doc_converter = (
+            allowed_formats=[
-    DocumentConverter(  # all of the below is optional, has internal defaults.
+                InputFormat.PDF,
-        allowed_formats=[
+                InputFormat.IMAGE,
-            InputFormat.PDF,
+                InputFormat.DOCX,
-            InputFormat.IMAGE,
+                InputFormat.HTML,
-            InputFormat.DOCX,
+                InputFormat.PPTX,
-            InputFormat.HTML,
+            ],  # whitelist formats, non-matching files are ignored.
-            InputFormat.PPTX,
+            format_options={
-        ],  # whitelist formats, non-matching files are ignored.
+                InputFormat.PDF: PdfFormatOption(
-        format_options={
+                    pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
-            InputFormat.PDF: PdfFormatOption(
+                ),
-                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
+                InputFormat.DOCX: WordFormatOption(
-            ),
+                    pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
-            InputFormat.DOCX: WordFormatOption(
+                ),
-                pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
+            },
-            ),
+        )
        },
    )
 )
-conv_results = doc_converter.convert_all(input_paths)
+    conv_results = doc_converter.convert_all(input_paths)
-for res in conv_results:
+    for res in conv_results:
-    out_path = Path("scratch")
+        out_path = Path("scratch")
-    print(
+        print(
-        f"Document {res.input.file.name} converted."
+            f"Document {res.input.file.name} converted."
-        f"\nSaved markdown output to: {str(out_path)}"
+            f"\nSaved markdown output to: {str(out_path)}"
-    )
+        )
-    # print(res.docdocument.export_to_markdown())
+        # print(res.docdocument.export_to_markdown())
-    # Export Docling document format to markdowndoc:
+        # Export Docling document format to markdowndoc:
-    with (out_path / f"{res.input.file.name}.md").open("w") as fp:
+        with (out_path / f"{res.input.file.name}.md").open("w") as fp:
-        fp.write(res.document.export_to_markdown())
+            fp.write(res.document.export_to_markdown())
-    with (out_path / f"{res.input.file.name}.json").open("w") as fp:
+        with (out_path / f"{res.input.file.name}.json").open("w") as fp:
-        fp.write(json.dumps(res.document.export_to_dict()))
+            fp.write(json.dumps(res.document.export_to_dict()))
-    with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
+        with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
-        fp.write(yaml.safe_dump(res.document.export_to_dict()))
+            fp.write(yaml.safe_dump(res.document.export_to_dict()))
 if __name__ == "__main__":
    main()
--- a/docs/v2.md
+++ b/docs/v2.md
@ -92,8 +92,9 @@ doc_converter = (
 **Note**: If you work only with defaults, all remains the same as in Docling v1.
 More options are shown in the following example units:
- [run_with_formats.py](docs/examples/run_with_formats.py)
+
- [custom_convert.py](docs/examples/custom_convert.py)
+- [run_with_formats.py](../examples/run_with_formats/)
 - [custom_convert.py](../examples/custom_convert/)
 ### Converting documents
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -55,15 +55,17 @@ nav:
    - Home: index.md
    - Installation: installation.md
    - Docling v2: v2.md
-  # - Concepts:
+  - Concepts:
-  #   - Docling Document: concepts/document.md
+    - The Docling Document format: concepts/docling_format.md
  #   - Chunking: concepts/chunking.md
  - Examples:
    - Conversion:
      - "Simple conversion": examples/minimal.py
      - "Custom conversion": examples/custom_convert.py
      - "Batch conversion": examples/batch_convert.py
      - "Multi-format conversion": examples/run_with_formats.py
      - "Figure export": examples/export_figures.py
      - "Figure enrichment": examples/develop_picture_enrichment.py
      - "Table export": examples/export_tables.py
      - "Multimodal export": examples/export_multimodal.py
    - RAG / QA:
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v2/redp5110.json
+++ b/tests/data/groundtruth/docling_v2/redp5110.json
--- a/tests/data/groundtruth/docling_v2/redp5110.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5110.pages.json
--- a/tests/data/groundtruth/docling_v2/redp5695.json
+++ b/tests/data/groundtruth/docling_v2/redp5695.json
--- a/tests/data/groundtruth/docling_v2/redp5695.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5695.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
@ -1 +1 @@
-{"schema_name": "DoclingDocument", "version": "1.0.0", "description": {}, "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
+{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
--- a/tests/test_backend_pdfium.py
+++ b/tests/test_backend_pdfium.py
@ -1,7 +1,7 @@
 from pathlib import Path
 import pytest
-from docling_core.types.doc.base import BoundingBox
+from docling_core.types.doc import BoundingBox
 from docling.backend.pypdfium2_backend import (
    PyPdfiumDocumentBackend,
		`@ -1 +1 @@`
			{"schema_name": "DoclingDocument", "version": "1.0.0", "description": {}, "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}				{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling 
bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}