Merge branch 'main' into nli/layoutmodel_improvements

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2025-04-24 10:03:05 +02:00
commit a553a1e5bf
15 changed files with 1595 additions and 788 deletions

3
.github/codecov.yml vendored
View File

@ -7,9 +7,10 @@ coverage:
default: default:
informational: true informational: true
target: auto # auto compares coverage to the previous base commit target: auto # auto compares coverage to the previous base commit
if_ci_failed: success
flags: flags:
- docling - docling
comment: comment:
layout: "reach, diff, flags, files" layout: "reach, diff, flags, files"
behavior: default behavior: default
require_changes: false # if true: only post the comment if coverage changes require_changes: false # if true: only post the comment if coverage changes

View File

@ -46,7 +46,7 @@ jobs:
uses: codecov/codecov-action@v5 uses: codecov/codecov-action@v5
with: with:
token: ${{ secrets.CODECOV_TOKEN }} token: ${{ secrets.CODECOV_TOKEN }}
file: ./coverage.xml files: ./coverage.xml
- name: Run examples - name: Run examples
run: | run: |
for file in docs/examples/*.py; do for file in docs/examples/*.py; do

View File

@ -22,6 +22,7 @@
[![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT) [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
[![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
[![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling) [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
[![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/) [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.

View File

@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
# tags that generate NodeItem elements # tags that generate NodeItem elements
TAGS_FOR_NODE_ITEMS: Final = [ TAGS_FOR_NODE_ITEMS: Final = [
"address",
"details",
"h1", "h1",
"h2", "h2",
"h3", "h3",
@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
"ul", "ul",
"ol", "ol",
"li", "li",
"summary",
"table", "table",
"figure", "figure",
"img", "img",
@ -163,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None: def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(tag, doc) self.handle_header(tag, doc)
elif tag.name in ["p"]: elif tag.name in ["p", "address", "summary"]:
self.handle_paragraph(tag, doc) self.handle_paragraph(tag, doc)
elif tag.name in ["pre", "code"]: elif tag.name in ["pre", "code"]:
self.handle_code(tag, doc) self.handle_code(tag, doc)
@ -177,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.handle_figure(tag, doc) self.handle_figure(tag, doc)
elif tag.name == "img": elif tag.name == "img":
self.handle_image(tag, doc) self.handle_image(tag, doc)
elif tag.name == "details":
self.handle_details(tag, doc)
else: else:
self.walk(tag, doc) self.walk(tag, doc)
@ -201,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return ["".join(result) + " "] return ["".join(result) + " "]
def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
"""Handle a details tag and its content.

Adds a group named "details" with label ``GroupLabel.SECTION`` to ``doc``,
parented at the current level, then walks ``element`` one level deeper.
Presumably the walk attaches the tag's children (e.g. summary, p) to the
new group via ``self.parents``; the walk itself is not shown here.
"""
# Register the new group as the parent slot for the next nesting level.
self.parents[self.level + 1] = doc.add_group(
name="details",
label=GroupLabel.SECTION,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
# Descend one level while the tag's children are processed.
self.level += 1
self.walk(element, doc)
# NOTE(review): self.level is still incremented at this point, so this
# clears the slot one past the details group, not the group's own slot.
# Confirm that leaving the group's slot populated is intended.
self.parents[self.level + 1] = None
# Restore the nesting level that was active on entry.
self.level -= 1
def handle_header(self, element: Tag, doc: DoclingDocument) -> None: def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles header tags (h1, h2, etc.).""" """Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", "")) hlevel = int(element.name.replace("h", ""))
@ -258,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
) )
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p).""" """Handles paragraph tags (p) or equivalent ones."""
if element.text is None: if element.text is None:
return return
text = element.text.strip() text = element.text.strip()

View File

@ -421,7 +421,7 @@ def convert( # noqa: C901
logging.basicConfig(level=logging.WARNING) logging.basicConfig(level=logging.WARNING)
elif verbose == 1: elif verbose == 1:
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
elif verbose == 2: else:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
settings.debug.visualize_cells = debug_visualize_cells settings.debug.visualize_cells = debug_visualize_cells

View File

@ -6,7 +6,7 @@ For each document format, the *document converter* knows which format-specific *
!!! tip !!! tip
While the document converter holds a default mapping, this configuration is parametrizable, so e.g. for the PDF format, different backends and different pipeline options can be used — see [Usage](../usage.md#adjust-pipeline-features). While the document converter holds a default mapping, this configuration is parametrizable, so e.g. for the PDF format, different backends and different pipeline options can be used — see [Usage](../usage/index.md#adjust-pipeline-features).
The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation. The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.

View File

@ -31,7 +31,7 @@ The first category is the _content items_, which are stored in these fields:
All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different
data structures depending on their type, and reference parents and children through JSON pointers. data structures depending on their type, and reference parents and children through JSON pointers.
The second category is _content structure_, which is encapsualted in: The second category is _content structure_, which is encapsulated in:
- `body`: The root node of a tree-structure for the main document body - `body`: The root node of a tree-structure for the main document body
- `furniture`: The root node of a tree-structure for all items that don't belong into the body (headers, footers, ...) - `furniture`: The root node of a tree-structure for all items that don't belong into the body (headers, footers, ...)
@ -49,7 +49,7 @@ Below example shows how all items in the first page are nested below the `title`
### Grouping ### Grouping
Below example shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as chilrden. The children of Below example shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as children. The children of
"Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the "Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the
top-level `groups` field. top-level `groups` field.

View File

@ -80,7 +80,7 @@ for source in sources:
fp.write(json.dumps(res.document.export_to_dict())) fp.write(json.dumps(res.document.export_to_dict()))
res.document.save_as_json( res.document.save_as_json(
out_path / f"{res.input.file.stem}.md", out_path / f"{res.input.file.stem}.json",
image_mode=ImageRefMode.PLACEHOLDER, image_mode=ImageRefMode.PLACEHOLDER,
) )

View File

@ -13,6 +13,7 @@
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
[![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT) [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
[![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
[![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/) [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.

2267
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -93,6 +93,7 @@ pluggy = "^1.0.0"
pylatexenc = "^2.10" pylatexenc = "^2.10"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
python = "^3.9.2"
black = { extras = ["jupyter"], version = "^24.4.2" } black = { extras = ["jupyter"], version = "^24.4.2" }
pytest = "^7.2.2" pytest = "^7.2.2"
pre-commit = "^3.7.1" pre-commit = "^3.7.1"

View File

@ -4,4 +4,7 @@ item-0 at level 0: unspecified: group _root_
item-3 at level 1: text: This is a regular paragraph. item-3 at level 1: text: This is a regular paragraph.
item-4 at level 1: text: This is a third div item-4 at level 1: text: This is a third div
with a new line. with a new line.
item-5 at level 1: text: This is a fourth div with a bold paragraph. item-5 at level 1: section: group details
item-6 at level 2: text: Heading for the details element
item-7 at level 2: text: Description of the details element.
item-8 at level 1: text: This is a fourth div with a bold paragraph.

View File

@ -4,7 +4,7 @@
"name": "example_06", "name": "example_06",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",
"binary_hash": 14574683870626799530, "binary_hash": 10224930410364781672,
"filename": "example_06.html" "filename": "example_06.html"
}, },
"furniture": { "furniture": {
@ -30,14 +30,35 @@
"$ref": "#/texts/3" "$ref": "#/texts/3"
}, },
{ {
"$ref": "#/texts/4" "$ref": "#/groups/0"
},
{
"$ref": "#/texts/6"
} }
], ],
"content_layer": "body", "content_layer": "body",
"name": "_root_", "name": "_root_",
"label": "unspecified" "label": "unspecified"
}, },
"groups": [], "groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "details",
"label": "section"
}
],
"texts": [ "texts": [
{ {
"self_ref": "#/texts/0", "self_ref": "#/texts/0",
@ -89,6 +110,30 @@
}, },
{ {
"self_ref": "#/texts/4", "self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Heading for the details element",
"text": "Heading for the details element"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Description of the details element.",
"text": "Description of the details element."
},
{
"self_ref": "#/texts/6",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/body"
}, },

View File

@ -7,4 +7,8 @@ This is a regular paragraph.
This is a third div This is a third div
with a new line. with a new line.
Heading for the details element
Description of the details element.
This is a fourth div with a bold paragraph. This is a fourth div with a bold paragraph.

View File

@ -7,6 +7,10 @@
<div>This is another div with text.</div> <div>This is another div with text.</div>
<p>This is a regular paragraph.</p> <p>This is a regular paragraph.</p>
<div>This is a third div<br/>with a new line.</div> <div>This is a third div<br/>with a new line.</div>
<details>
<summary>Heading for the details element</summary>
<p>Description of the details element.</p>
</details>
<div><p>This is a fourth div with a <b>bold</b> paragraph.</p></div> <div><p>This is a fourth div with a <b>bold</b> paragraph.</p></div>
</body> </body>
</html> </html>