Documentation updates, remove DescriptionItem in DoclingDocument init

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-16 16:05:49 +02:00
parent 07206c5b3e
commit 734d77c8ae
31 changed files with 154 additions and 100 deletions

View File

@ -5,12 +5,12 @@ from typing import Set, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from docling_core.types.doc import ( from docling_core.types.doc import (
DescriptionItem, DocItemLabel,
DoclingDocument, DoclingDocument,
GroupLabel,
TableCell, TableCell,
TableData, TableData,
) )
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff # access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(name="dummy")
_log.debug("Trying to convert HTML...") _log.debug("Trying to convert HTML...")
if self.is_valid(): if self.is_valid():

View File

@ -4,16 +4,17 @@ from pathlib import Path
from typing import Set, Union from typing import Set, Union
from docling_core.types.doc import ( from docling_core.types.doc import (
DescriptionItem, BoundingBox,
CoordOrigin,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
GroupLabel, GroupLabel,
ProvenanceItem, ProvenanceItem,
Size,
TableCell, TableCell,
TableData, TableData,
) )
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
from pptx import Presentation from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@ -96,7 +97,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
else: else:
docname = "stream" docname = "stream"
doc = DoclingDocument( doc = DoclingDocument(
description=DescriptionItem(), name=docname, origin=origin name=docname, origin=origin
) # must add origin information ) # must add origin information
doc = self.walk_linear(self.pptx_obj, doc) doc = self.walk_linear(self.pptx_obj, doc)

View File

@ -5,7 +5,6 @@ from typing import Set, Union
import docx import docx
from docling_core.types.doc import ( from docling_core.types.doc import (
DescriptionItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
@ -99,9 +98,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
docname = Path(fname).stem docname = Path(fname).stem
else: else:
docname = "stream" docname = "stream"
doc = DoclingDocument( doc = DoclingDocument(name=docname, origin=origin)
description=DescriptionItem(), name=docname, origin=origin
)
if self.is_valid(): if self.is_valid():
assert self.docx_obj is not None assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)

View File

@ -2,9 +2,13 @@ from enum import Enum, auto
from io import BytesIO from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size from docling_core.types.doc import (
from docling_core.types.doc.document import PictureDataType, TableCell BoundingBox,
from docling_core.types.doc.labels import DocItemLabel DocItemLabel,
PictureDataType,
Size,
TableCell,
)
from PIL.Image import Image from PIL.Image import Image
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict

View File

@ -13,7 +13,6 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc import ( from docling_core.types.doc import (
DescriptionItem,
DocItem, DocItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
@ -37,11 +36,9 @@ from docling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
DocumentStream, DocumentStream,
ErrorItem, ErrorItem,
FigureElement,
InputFormat, InputFormat,
MimeTypeToFormat, MimeTypeToFormat,
Page, Page,
PageElement,
) )
from docling.datamodel.settings import DocumentLimits from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash, create_hash from docling.utils.utils import create_file_hash, create_hash
@ -70,9 +67,7 @@ layout_label_to_ds_type = {
DocItemLabel.PARAGRAPH: "paragraph", DocItemLabel.PARAGRAPH: "paragraph",
} }
_EMPTY_DOCLING_DOC = DoclingDocument( _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
description=DescriptionItem(), name="dummy"
) # TODO: Stub
class InputDocument(BaseModel): class InputDocument(BaseModel):

View File

@ -11,8 +11,7 @@ from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import Figure, TableCell from docling_core.types.legacy_doc.base import Figure, TableCell
from PIL import ImageDraw from PIL import ImageDraw
@ -222,15 +221,8 @@ class GlmModel:
ds_doc_dict = ds_doc.model_dump(by_alias=True) ds_doc_dict = ds_doc.model_dump(by_alias=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict) glm_doc = self.model.apply_on_doc(ds_doc_dict)
# ds_doc_dict = to_legacy_document_format(
# glm_doc, ds_doc_dict, update_name_label=True
# )
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
# legacy_doc: DsLegacyDocument = None
# if self.create_legacy_output:
# legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
# DEBUG code: # DEBUG code:
def draw_clusters_and_cells(ds_document, page_no): def draw_clusters_and_cells(ds_document, page_no):

View File

@ -5,8 +5,7 @@ import time
from pathlib import Path from pathlib import Path
from typing import Iterable, List from typing import Iterable, List
from docling_core.types.doc import CoordOrigin from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_core.types.doc.labels import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw from PIL import ImageDraw

View File

@ -3,9 +3,7 @@ from pathlib import Path
from typing import Iterable, List from typing import Iterable, List
import numpy import numpy
from docling_core.types.doc import BoundingBox from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_core.types.doc.document import TableCell
from docling_core.types.doc.labels import DocItemLabel
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw from PIL import ImageDraw

View File

@ -2,7 +2,7 @@ import logging
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend

View File

@ -2,7 +2,7 @@ import copy
import logging import logging
import networkx as nx import networkx as nx
from docling_core.types.doc.labels import DocItemLabel from docling_core.types.doc import DocItemLabel
logger = logging.getLogger("layout_utils") logger = logging.getLogger("layout_utils")

Binary file not shown.

After

Width:  |  Height:  |  Size: 369 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 358 KiB

View File

@ -0,0 +1,64 @@
With Docling v2, we introduce a unified document representation format called `DoclingDocument`. It is defined as a
pydantic datatype, which can express several features common to documents, such as:
* Text, Tables, Pictures, and more
* Document hierarchy with sections and groups
* Disambiguation between main body and headers, footers (furniture)
* Layout information (i.e. bounding boxes) for all items, if available
* Provenance information
It also brings a set of document construction APIs to build up a `DoclingDocument` from scratch.
# Example document structures
To illustrate the features of the `DoclingDocument` format, consider the following side-by-side comparison of a
`DoclingDocument` converted from `tests/data/word_sample.docx`. The left side shows snippets from the converted document
serialized as YAML; the right side shows the corresponding visual parts in MS Word.
## Basic structure
A `DoclingDocument` exposes top-level fields for the document content, organized in two categories.
The first category is the _content items_, which are stored in these fields:
- `texts`: All items that have a text representation (paragraph, section heading, equation, ...). Base class is `TextItem`.
- `tables`: All tables, type `TableItem`. Can carry structure annotations.
- `pictures`: All pictures, type `PictureItem`. Can carry structure annotations.
- `key_value_items`: All key-value items.
All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different
data structures depending on their type, and reference parents and children through JSON pointers.
The second category is _content structure_, which is encapsulated in:
- `body`: The root node of a tree-structure for the main document body
- `furniture`: The root node of a tree-structure for all items that don't belong in the body (headers, footers, ...)
- `groups`: A set of items that don't represent content, but act as containers for other content items (e.g. a list, a chapter)
All of the above fields are only storing `NodeItem` instances, which reference children and parents
through JSON pointers.
The reading order of the document is encapsulated through the `body` tree and the order of _children_ in each item
in the tree.
The example below shows how all items on the first page are nested below the `title` item (`#/texts/1`).
![doc_hierarchy_1](../assets/docling_doc_hierarchy_1.png)
## Grouping
The example below shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as children. The children of
"Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the
top-level `groups` field.
![doc_hierarchy_2](../assets/docling_doc_hierarchy_2.png)
## Tables
TBD
## Pictures
TBD
## Provenance
TBD

View File

@ -3,7 +3,7 @@ from pathlib import Path
from typing import Any, Iterable from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc.document import PictureClassificationData, PictureItem, PictureClassificationClass from docling_core.types.doc import PictureClassificationData, PictureItem, PictureClassificationClass
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions

View File

@ -2,7 +2,7 @@ import logging
import time import time
from pathlib import Path from pathlib import Path
from docling_core.types.doc.document import PictureItem, TableItem from docling_core.types.doc import PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions

View File

@ -16,57 +16,58 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
USE_EXPERIMENTAL = False def main():
input_paths = [
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
]
input_paths = [ ## for defaults use:
Path("tests/data/wiki_duck.html"), # doc_converter = DocumentConverter()
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
]
## for defaults use: ## to customize use:
# doc_converter = DocumentConverter()
## to customize use: doc_converter = (
DocumentConverter( # all of the below is optional, has internal defaults.
doc_converter = ( allowed_formats=[
DocumentConverter( # all of the below is optional, has internal defaults. InputFormat.PDF,
allowed_formats=[ InputFormat.IMAGE,
InputFormat.PDF, InputFormat.DOCX,
InputFormat.IMAGE, InputFormat.HTML,
InputFormat.DOCX, InputFormat.PPTX,
InputFormat.HTML, ], # whitelist formats, non-matching files are ignored.
InputFormat.PPTX, format_options={
], # whitelist formats, non-matching files are ignored. InputFormat.PDF: PdfFormatOption(
format_options={ pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
InputFormat.PDF: PdfFormatOption( ),
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend InputFormat.DOCX: WordFormatOption(
), pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
InputFormat.DOCX: WordFormatOption( ),
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend },
), )
},
) )
)
conv_results = doc_converter.convert_all(input_paths) conv_results = doc_converter.convert_all(input_paths)
for res in conv_results: for res in conv_results:
out_path = Path("scratch") out_path = Path("scratch")
print( print(
f"Document {res.input.file.name} converted." f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}" f"\nSaved markdown output to: {str(out_path)}"
) )
# print(res.docdocument.export_to_markdown()) # print(res.docdocument.export_to_markdown())
# Export Docling document format to markdowndoc: # Export Docling document format to markdowndoc:
with (out_path / f"{res.input.file.name}.md").open("w") as fp: with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.document.export_to_markdown()) fp.write(res.document.export_to_markdown())
with (out_path / f"{res.input.file.name}.json").open("w") as fp: with (out_path / f"{res.input.file.name}.json").open("w") as fp:
fp.write(json.dumps(res.document.export_to_dict())) fp.write(json.dumps(res.document.export_to_dict()))
with (out_path / f"{res.input.file.name}.yaml").open("w") as fp: with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
fp.write(yaml.safe_dump(res.document.export_to_dict())) fp.write(yaml.safe_dump(res.document.export_to_dict()))
if __name__ == "__main__":
main()

View File

@ -92,8 +92,9 @@ doc_converter = (
**Note**: If you work only with defaults, all remains the same as in Docling v1. **Note**: If you work only with defaults, all remains the same as in Docling v1.
More options are shown in the following example units: More options are shown in the following example units:
- [run_with_formats.py](docs/examples/run_with_formats.py)
- [custom_convert.py](docs/examples/custom_convert.py) - [run_with_formats.py](../examples/run_with_formats/)
- [custom_convert.py](../examples/custom_convert/)
### Converting documents ### Converting documents

View File

@ -55,15 +55,17 @@ nav:
- Home: index.md - Home: index.md
- Installation: installation.md - Installation: installation.md
- Docling v2: v2.md - Docling v2: v2.md
# - Concepts: - Concepts:
# - Docling Document: concepts/document.md - The Docling Document format: concepts/docling_format.md
# - Chunking: concepts/chunking.md # - Chunking: concepts/chunking.md
- Examples: - Examples:
- Conversion: - Conversion:
- "Simple conversion": examples/minimal.py - "Simple conversion": examples/minimal.py
- "Custom conversion": examples/custom_convert.py - "Custom conversion": examples/custom_convert.py
- "Batch conversion": examples/batch_convert.py - "Batch conversion": examples/batch_convert.py
- "Multi-format conversion": examples/run_with_formats.py
- "Figure export": examples/export_figures.py - "Figure export": examples/export_figures.py
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Table export": examples/export_tables.py - "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py - "Multimodal export": examples/export_multimodal.py
- RAG / QA: - RAG / QA:

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
{"schema_name": "DoclingDocument", "version": "1.0.0", "description": {}, "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}} {"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles 
PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@ -1,7 +1,7 @@
from pathlib import Path from pathlib import Path
import pytest import pytest
from docling_core.types.doc.base import BoundingBox from docling_core.types.doc import BoundingBox
from docling.backend.pypdfium2_backend import ( from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend, PyPdfiumDocumentBackend,