Documentation updates, remove DescriptionItem in DoclingDocument init

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-16 16:05:49 +02:00
parent 07206c5b3e
commit 734d77c8ae
31 changed files with 154 additions and 100 deletions

View File

@ -5,12 +5,12 @@ from typing import Set, Union
from bs4 import BeautifulSoup
from docling_core.types.doc import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
GroupLabel,
TableCell,
TableData,
)
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
doc = DoclingDocument(name="dummy")
_log.debug("Trying to convert HTML...")
if self.is_valid():

View File

@ -4,16 +4,17 @@ from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
DescriptionItem,
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ProvenanceItem,
Size,
TableCell,
TableData,
)
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@ -96,7 +97,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
else:
docname = "stream"
doc = DoclingDocument(
description=DescriptionItem(), name=docname, origin=origin
name=docname, origin=origin
) # must add origin information
doc = self.walk_linear(self.pptx_obj, doc)

View File

@ -5,7 +5,6 @@ from typing import Set, Union
import docx
from docling_core.types.doc import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
@ -99,9 +98,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(
description=DescriptionItem(), name=docname, origin=origin
)
doc = DoclingDocument(name=docname, origin=origin)
if self.is_valid():
assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)

View File

@ -2,9 +2,13 @@ from enum import Enum, auto
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.document import PictureDataType, TableCell
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc import (
BoundingBox,
DocItemLabel,
PictureDataType,
Size,
TableCell,
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict

View File

@ -13,7 +13,6 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc import (
DescriptionItem,
DocItem,
DocItemLabel,
DoclingDocument,
@ -37,11 +36,9 @@ from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
ErrorItem,
FigureElement,
InputFormat,
MimeTypeToFormat,
Page,
PageElement,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash, create_hash
@ -70,9 +67,7 @@ layout_label_to_ds_type = {
DocItemLabel.PARAGRAPH: "paragraph",
}
_EMPTY_DOCLING_DOC = DoclingDocument(
description=DescriptionItem(), name="dummy"
) # TODO: Stub
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
class InputDocument(BaseModel):

View File

@ -11,8 +11,7 @@ from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import Figure, TableCell
from PIL import ImageDraw
@ -222,15 +221,8 @@ class GlmModel:
ds_doc_dict = ds_doc.model_dump(by_alias=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)
# ds_doc_dict = to_legacy_document_format(
# glm_doc, ds_doc_dict, update_name_label=True
# )
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
# legacy_doc: DsLegacyDocument = None
# if self.create_legacy_output:
# legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):

View File

@ -5,8 +5,7 @@ import time
from pathlib import Path
from typing import Iterable, List
from docling_core.types.doc import CoordOrigin
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw

View File

@ -3,9 +3,7 @@ from pathlib import Path
from typing import Iterable, List
import numpy
from docling_core.types.doc import BoundingBox
from docling_core.types.doc.document import TableCell
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw

View File

@ -2,7 +2,7 @@ import logging
from pathlib import Path
from typing import Optional
from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend

View File

@ -2,7 +2,7 @@ import copy
import logging
import networkx as nx
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc import DocItemLabel
logger = logging.getLogger("layout_utils")

Binary file not shown.

After

Width:  |  Height:  |  Size: 369 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 358 KiB

View File

@ -0,0 +1,64 @@
With Docling v2, we introduce a unified document representation format called `DoclingDocument`. It is defined as a
pydantic datatype, which can express several features common to documents, such as:
* Text, Tables, Pictures, and more
* Document hierarchy with sections and groups
* Disambiguation between main body and headers, footers (furniture)
* Layout information (i.e. bounding boxes) for all items, if available
* Provenance information
It also brings a set of document construction APIs to build up a `DoclingDocument` from scratch.
# Example document structures
To illustrate the features of the `DoclingDocument` format, consider the following side-by-side comparison of a
`DoclingDocument` converted from `tests/data/word_sample.docx`. The left side shows snippets from the converted document
serialized as YAML; the right side shows the corresponding visual parts in MS Word.
## Basic structure
A `DoclingDocument` exposes top-level fields for the document content, organized in two categories.
The first category is the _content items_, which are stored in these fields:
- `texts`: All items that have a text representation (paragraph, section heading, equation, ...). Base class is `TextItem`.
- `tables`: All tables, type `TableItem`. Can carry structure annotations.
- `pictures`: All pictures, type `PictureItem`. Can carry structure annotations.
- `key_value_items`: All key-value items.
All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different
data structures depending on their type, and reference parents and children through JSON pointers.
The second category is _content structure_, which is encapsulated in:
- `body`: The root node of a tree-structure for the main document body
- `furniture`: The root node of a tree-structure for all items that don't belong in the body (headers, footers, ...)
- `groups`: A set of items that don't represent content, but act as containers for other content items (e.g. a list, a chapter)
All of the above fields store only `NodeItem` instances, which reference children and parents
through JSON pointers.
The reading order of the document is encapsulated through the `body` tree and the order of _children_ in each item
in the tree.
The example below shows how all items on the first page are nested below the `title` item (`#/texts/1`).
![doc_hierarchy_1](../assets/docling_doc_hierarchy_1.png)
## Grouping
The example below shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as children. The children of
"Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the
top-level `groups` field.
![doc_hierarchy_2](../assets/docling_doc_hierarchy_2.png)
## Tables
TBD
## Pictures
TBD
## Provenance
TBD

View File

@ -3,7 +3,7 @@ from pathlib import Path
from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc.document import PictureClassificationData, PictureItem, PictureClassificationClass
from docling_core.types.doc import PictureClassificationData, PictureItem, PictureClassificationClass
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

View File

@ -2,7 +2,7 @@ import logging
import time
from pathlib import Path
from docling_core.types.doc.document import PictureItem, TableItem
from docling_core.types.doc import PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions

View File

@ -16,23 +16,22 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__)
USE_EXPERIMENTAL = False
input_paths = [
def main():
input_paths = [
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
]
]
## for defaults use:
# doc_converter = DocumentConverter()
## for defaults use:
# doc_converter = DocumentConverter()
## to customize use:
## to customize use:
doc_converter = (
doc_converter = (
DocumentConverter( # all of the below is optional, has internal defaults.
allowed_formats=[
InputFormat.PDF,
@ -50,11 +49,11 @@ doc_converter = (
),
},
)
)
)
conv_results = doc_converter.convert_all(input_paths)
conv_results = doc_converter.convert_all(input_paths)
for res in conv_results:
for res in conv_results:
out_path = Path("scratch")
print(
f"Document {res.input.file.name} converted."
@ -70,3 +69,5 @@ for res in conv_results:
with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
fp.write(yaml.safe_dump(res.document.export_to_dict()))
if __name__ == "__main__":
main()

View File

@ -92,8 +92,9 @@ doc_converter = (
**Note**: If you work only with defaults, all remains the same as in Docling v1.
More options are shown in the following example units:
- [run_with_formats.py](docs/examples/run_with_formats.py)
- [custom_convert.py](docs/examples/custom_convert.py)
- [run_with_formats.py](../examples/run_with_formats/)
- [custom_convert.py](../examples/custom_convert/)
### Converting documents

View File

@ -55,15 +55,17 @@ nav:
- Home: index.md
- Installation: installation.md
- Docling v2: v2.md
# - Concepts:
# - Docling Document: concepts/document.md
- Concepts:
- The Docling Document format: concepts/docling_format.md
# - Chunking: concepts/chunking.md
- Examples:
- Conversion:
- "Simple conversion": examples/minimal.py
- "Custom conversion": examples/custom_convert.py
- "Batch conversion": examples/batch_convert.py
- "Multi-format conversion": examples/run_with_formats.py
- "Figure export": examples/export_figures.py
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py
- RAG / QA:

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
{"schema_name": "DoclingDocument", "version": "1.0.0", "description": {}, "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@ -1,7 +1,7 @@
from pathlib import Path
import pytest
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc import BoundingBox
from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend,