Merge branch 'main' into nli/layoutmodel_improvements

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2025-04-24 10:03:05 +02:00 · 2025-04-24 10:03:05 +02:00 · a553a1e5bf
commit a553a1e5bf
parent 4ce338f455 976431ed7f
15 changed files with 1595 additions and 788 deletions
--- a/.github/codecov.yml
+++ b/.github/codecov.yml
@ -7,9 +7,10 @@ coverage:
      default:
        informational: true
        target: auto  # auto compares coverage to the previous base commit
        if_ci_failed: success
        flags:
          - docling
-  comment:
+comment:
  layout: "reach, diff, flags, files"
  behavior: default
  require_changes: false  # if true: only post the comment if coverage changes
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@ -46,7 +46,7 @@ jobs:
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
-          file: ./coverage.xml
+          files: ./coverage.xml
      - name: Run examples
        run: |
          for file in docs/examples/*.py; do
--- a/README.md
+++ b/README.md
@ -22,6 +22,7 @@
 [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
 [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
 [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
 [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
 [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
 Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
 # tags that generate NodeItem elements
 TAGS_FOR_NODE_ITEMS: Final = [
    "address",
    "details",
    "h1",
    "h2",
    "h3",
@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
    "ul",
    "ol",
    "li",
    "summary",
    "table",
    "figure",
    "img",
@ -163,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            self.handle_header(tag, doc)
-        elif tag.name in ["p"]:
+        elif tag.name in ["p", "address", "summary"]:
            self.handle_paragraph(tag, doc)
        elif tag.name in ["pre", "code"]:
            self.handle_code(tag, doc)
@ -177,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            self.handle_figure(tag, doc)
        elif tag.name == "img":
            self.handle_image(tag, doc)
        elif tag.name == "details":
            self.handle_details(tag, doc)
        else:
            self.walk(tag, doc)
@ -201,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        return ["".join(result) + " "]
    def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
        """Handle details tag (details) and its content.

        Adds a group named "details" with label ``GroupLabel.SECTION`` to
        ``doc``, parented to the current level's entry in ``self.parents``,
        then walks ``element`` one level deeper.

        Args:
            element: The ``details`` tag to process.
            doc: The document that receives the new group.
        """
        # Register the new group as the parent for the next nesting level.
        self.parents[self.level + 1] = doc.add_group(
            name="details",
            label=GroupLabel.SECTION,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        # Descend so that whatever walk() adds is attached at the deeper level
        # (presumably under the group created above; see walk() to confirm).
        self.level += 1
        self.walk(element, doc)
        # self.level has already been incremented here, so this clears the
        # slot one level below the group's own slot; the group's entry itself
        # is left in place.
        # NOTE(review): confirm this is the intended slot to reset.
        self.parents[self.level + 1] = None
        # Restore the nesting level that was active on entry.
        self.level -= 1
    def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles header tags (h1, h2, etc.)."""
        hlevel = int(element.name.replace("h", ""))
@ -258,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            )
    def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles paragraph tags (p)."""
+        """Handles paragraph tags (p) or equivalent ones."""
        if element.text is None:
            return
        text = element.text.strip()
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -421,7 +421,7 @@ def convert(  # noqa: C901
        logging.basicConfig(level=logging.WARNING)
    elif verbose == 1:
        logging.basicConfig(level=logging.INFO)
-    elif verbose == 2:
+    else:
        logging.basicConfig(level=logging.DEBUG)
    settings.debug.visualize_cells = debug_visualize_cells
--- a/docs/concepts/architecture.md
+++ b/docs/concepts/architecture.md
@ -6,7 +6,7 @@ For each document format, the *document converter* knows which format-specific *
 !!! tip
-    While the document converter holds a default mapping, this configuration is parametrizable, so e.g. for the PDF format, different backends and different pipeline options can be used — see [Usage](../usage.md#adjust-pipeline-features).
+    While the document converter holds a default mapping, this configuration is parametrizable, so e.g. for the PDF format, different backends and different pipeline options can be used — see [Usage](../usage/index.md#adjust-pipeline-features).
 The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
--- a/docs/concepts/docling_document.md
+++ b/docs/concepts/docling_document.md
@ -31,7 +31,7 @@ The first category is the _content items_, which are stored in these fields:
 All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different
 data structures depending on their type, and reference parents and children through JSON pointers.
-The second category is _content structure_, which is encapsualted in:
+The second category is _content structure_, which is encapsulated in:
 - `body`: The root node of a tree-structure for the main document body
 - `furniture`: The root node of a tree-structure for all items that don't belong into the body (headers, footers, ...)
@ -49,7 +49,7 @@ Below example shows how all items in the first page are nested below the `title`
 ### Grouping
-Below example shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as chilrden. The children of
+Below example shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as children. The children of
 "Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the
 top-level `groups` field.
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@ -80,7 +80,7 @@ for source in sources:
        fp.write(json.dumps(res.document.export_to_dict()))
    res.document.save_as_json(
-        out_path / f"{res.input.file.stem}.md",
+        out_path / f"{res.input.file.stem}.json",
        image_mode=ImageRefMode.PLACEHOLDER,
    )
--- a/docs/index.md
+++ b/docs/index.md
@ -13,6 +13,7 @@
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
 [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
 [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
 [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
 Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -93,6 +93,7 @@ pluggy = "^1.0.0"
 pylatexenc = "^2.10"
 [tool.poetry.group.dev.dependencies]
 python = "^3.9.2"
 black = { extras = ["jupyter"], version = "^24.4.2" }
 pytest = "^7.2.2"
 pre-commit = "^3.7.1"
--- a/tests/data/groundtruth/docling_v2/example_06.html.itxt
+++ b/tests/data/groundtruth/docling_v2/example_06.html.itxt
@ -4,4 +4,7 @@ item-0 at level 0: unspecified: group _root_
  item-3 at level 1: text: This is a regular paragraph.
  item-4 at level 1: text: This is a third div
 with a new line.
-  item-5 at level 1: text: This is a fourth div with a bold paragraph.
+  item-5 at level 1: section: group details
    item-6 at level 2: text: Heading for the details element
    item-7 at level 2: text: Description of the details element.
  item-8 at level 1: text: This is a fourth div with a bold paragraph.
--- a/tests/data/groundtruth/docling_v2/example_06.html.json
+++ b/tests/data/groundtruth/docling_v2/example_06.html.json
@ -4,7 +4,7 @@
  "name": "example_06",
  "origin": {
    "mimetype": "text/html",
-    "binary_hash": 14574683870626799530,
+    "binary_hash": 10224930410364781672,
    "filename": "example_06.html"
  },
  "furniture": {
@ -30,14 +30,35 @@
        "$ref": "#/texts/3"
      },
      {
-        "$ref": "#/texts/4"
+        "$ref": "#/groups/0"
      },
      {
        "$ref": "#/texts/6"
      }
    ],
    "content_layer": "body",
    "name": "_root_",
    "label": "unspecified"
  },
-  "groups": [],
+  "groups": [
    {
      "self_ref": "#/groups/0",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/4"
        },
        {
          "$ref": "#/texts/5"
        }
      ],
      "content_layer": "body",
      "name": "details",
      "label": "section"
    }
  ],
  "texts": [
    {
      "self_ref": "#/texts/0",
@ -89,6 +110,30 @@
    },
    {
      "self_ref": "#/texts/4",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Heading for the details element",
      "text": "Heading for the details element"
    },
    {
      "self_ref": "#/texts/5",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Description of the details element.",
      "text": "Description of the details element."
    },
    {
      "self_ref": "#/texts/6",
      "parent": {
        "$ref": "#/body"
      },
--- a/tests/data/groundtruth/docling_v2/example_06.html.md
+++ b/tests/data/groundtruth/docling_v2/example_06.html.md
@ -7,4 +7,8 @@ This is a regular paragraph.
 This is a third div
 with a new line.
 Heading for the details element
 Description of the details element.
 This is a fourth div with a bold paragraph.
--- a/tests/data/html/example_06.html
+++ b/tests/data/html/example_06.html
@ -7,6 +7,10 @@
    <div>This is another div with text.</div>
    <p>This is a regular paragraph.</p>
    <div>This is a third div<br/>with a new line.</div>
    <details>
        <summary>Heading for the details element</summary>
        <p>Description of the details element.</p>
    </details> 
    <div><p>This is a fourth div with a <b>bold</b> paragraph.</p></div>
 </body>
 </html>