fix(html): slow table parsing (#2582)

* fix(html): simplify parsing of simple table cells Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(html): add test for rich table cells Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(html): ensure table cells with formatted text are parsed as RichTableCell Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): simplify process_rich_table_cells since only rich cells are processed Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(html): formatted cell runs should be parsed as text items respecting the order Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: pin latest docling-core and update uv.lock Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: upgrade dependencies on uv.lock Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-11-06 05:25:36 +01:00
parent 8da3d287ed
commit 0ba8d5d9e3
11 changed files with 9503 additions and 6544 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -354,32 +354,51 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    ) -> tuple[bool, Union[RefItem, None]]:
        rich_table_cell = False
        ref_for_rich_cell = None
-        if len(provs_in_cell) > 0:
-            ref_for_rich_cell = provs_in_cell[0]
-        if len(provs_in_cell) > 1:
-            # Cell has multiple elements, we need to group them
+        if len(provs_in_cell) >= 1:
+            # Rich cell with one or more elements, we need to group them
            rich_table_cell = True
            ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
                group_name, doc, provs_in_cell, docling_table
            )
-        elif len(provs_in_cell) == 1:
-            item_ref = provs_in_cell[0]
-            pr_item = item_ref.resolve(doc)
-            if isinstance(pr_item, TextItem):
-                # Cell has only one element and it's just a text
-                rich_table_cell = False
-                try:
-                    doc.delete_items(node_items=[pr_item])
-                except Exception as e:
-                    _log.error(f"Error while making rich table: {e}.")
-            else:
-                rich_table_cell = True
-                ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
-                    group_name, doc, provs_in_cell, docling_table
-                )

        return rich_table_cell, ref_for_rich_cell

+    def _is_rich_table_cell(self, table_cell: Tag) -> bool:
+        """Determine whether a table cell should be parsed as a Docling RichTableCell.
+
+        A table cell can hold rich content and be parsed with a Docling RichTableCell.
+        However, this requires walking through the content elements and creating
+        Docling node items. If the cell holds only plain text, the parsing is simpler
+        and using a TableCell is preferred.
+
+        Args:
+            table_cell: The HTML tag representing a table cell.
+
+        Returns:
+            Whether the cell should be parsed as RichTableCell.
+        """
+        children = table_cell.find_all(recursive=True)  # all descendants of type Tag
+        if not children:
+            # No nested tags: the cell is rich only if it holds several text nodes.
+            content = [
+                item
+                for item in table_cell.contents
+                if isinstance(item, NavigableString)
+            ]
+            return len(content) > 1
+
+        annotations = self._extract_text_and_hyperlink_recursively(
+            table_cell, find_parent_annotation=True
+        )
+        if not annotations:
+            # No annotated text was extracted: rich only if the cell embeds an image.
+            # any() is required here; bool() of a generator expression is always True.
+            return any(item.name == "img" for item in children)
+        if len(annotations) == 1:
+            # A single text run is rich only when it carries formatting, a link or code.
+            anno: AnnotatedText = annotations[0]
+            return bool(anno.formatting) or bool(anno.hyperlink) or bool(anno.code)
+
+        # More than one annotated text run: treat the cell as rich.
+        return True
+
    def parse_table_data(
        self,
        element: Tag,
@@ -437,23 +456,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                        formula.replace_with(NavigableString(math_formula))

                provs_in_cell: list[RefItem] = []
-                # Parse table cell sub-tree for Rich Cells content:
-                table_level = self.level
-                provs_in_cell = self._walk(html_cell, doc)
-                # After walking sub-tree in cell, restore previously set level
-                self.level = table_level
+                rich_table_cell = self._is_rich_table_cell(html_cell)
+                if rich_table_cell:
+                    # Parse table cell sub-tree for Rich Cells content:
+                    table_level = self.level
+                    provs_in_cell = self._walk(html_cell, doc)
+                    # After walking sub-tree in cell, restore previously set level
+                    self.level = table_level

-                rich_table_cell = False
-                ref_for_rich_cell = None
-                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
-                rich_table_cell, ref_for_rich_cell = (
-                    HTMLDocumentBackend.process_rich_table_cells(
-                        provs_in_cell, group_name, doc, docling_table
+                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
+                    rich_table_cell, ref_for_rich_cell = (
+                        HTMLDocumentBackend.process_rich_table_cells(
+                            provs_in_cell, group_name, doc, docling_table
+                        )
                    )
-                )

                # Extracting text
-                text = self.get_text(html_cell).strip()
+                text = HTMLDocumentBackend._clean_unicode(
+                    self.get_text(html_cell).strip()
+                )
                col_span, row_span = self._get_cell_spans(html_cell)
                if row_header:
                    row_span -= 1
@@ -555,6 +576,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    if im_ref3:
                        added_refs.append(im_ref3)
                elif name in _FORMAT_TAG_MAP:
+                    flush_buffer()
                    with self._use_format([name]):
                        wk = self._walk(node, doc)
                        added_refs.extend(wk)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ authors = [
 requires-python = '>=3.9,<4.0'
 dependencies = [
  'pydantic (>=2.0.0,<3.0.0)',
-  'docling-core[chunking] (>=2.48.2,<3.0.0)',
+  'docling-core[chunking] (>=2.50.1,<3.0.0)',
  'docling-parse (>=4.7.0,<5.0.0)',
  "docling-ibm-models>=3.9.1,<4",
  'filetype (>=1.2.0,<2.0.0)',
--- a/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.itxt
+++ b/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.itxt
@@ -0,0 +1,53 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: title: Rich Table Cells in HTML
+    item-2 at level 2: table with [5x3]
+      item-3 at level 3: unspecified: group rich_cell_group_1_1_3
+        item-4 at level 4: text: Large
+        item-5 at level 4: text: ,
+        item-6 at level 4: text: loud
+        item-7 at level 4: text: ,
+        item-8 at level 4: text: noisy
+        item-9 at level 4: text: ,
+        item-10 at level 4: text: small
+      item-11 at level 3: unspecified: group rich_cell_group_1_0_4
+        item-12 at level 4: list: group list
+          item-13 at level 5: list_item: Pond
+          item-14 at level 5: list_item: Marsh
+          item-15 at level 5: list_item: Riverbank
+      item-16 at level 3: unspecified: group rich_cell_group_1_1_4
+        item-17 at level 4: list: group ordered list
+          item-18 at level 5: list_item: Fly south in winter
+          item-19 at level 5: list_item: Build nest on ground
+    item-20 at level 2: table with [4x2]
+      item-21 at level 3: unspecified: group rich_cell_group_2_0_1
+        item-22 at level 4: text: Aythya
+        item-23 at level 4: text: (Diving ducks)
+      item-24 at level 3: unspecified: group rich_cell_group_2_0_2
+        item-25 at level 4: text: Lophonetta
+        item-26 at level 4: text: (Pintail group)
+      item-27 at level 3: unspecified: group rich_cell_group_2_0_3
+        item-28 at level 4: text: Oxyura
+        item-29 at level 4: text: (Benthic ducks)
+    item-30 at level 2: table with [4x2]
+      item-31 at level 3: unspecified: group rich_cell_group_3_0_1
+        item-32 at level 4: text: Swim
+      item-33 at level 3: unspecified: group rich_cell_group_3_0_1
+        item-34 at level 4: text: Gracefully glide on H
+        item-35 at level 4: text: 2
+        item-36 at level 4: text: O surfaces.
+      item-37 at level 3: unspecified: group rich_cell_group_3_0_2
+        item-38 at level 4: text: Fly
+      item-39 at level 3: unspecified: group rich_cell_group_3_0_3
+        item-40 at level 4: text: Quack
+      item-41 at level 3: unspecified: group rich_cell_group_4_0_3
+        item-42 at level 4: table with [3x2]
+    item-43 at level 2: table with [5x3]
+      item-44 at level 3: unspecified: group rich_cell_group_5_1_1
+        item-45 at level 4: text: View PNG
+      item-46 at level 3: unspecified: group rich_cell_group_5_1_2
+        item-47 at level 4: picture
+          item-47 at level 5: caption: White-headed duck thumbnail
+      item-48 at level 3: unspecified: group rich_cell_group_5_1_3
+        item-49 at level 4: text: View Full-Size Image
+    item-50 at level 2: picture
+  item-51 at level 1: caption: White-headed duck thumbnail
--- a/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.json
+++ b/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.json
--- a/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.md
+++ b/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.md
@@ -0,0 +1,29 @@
+# Rich Table Cells in HTML
+
+| Name                | Habitat                    | Comment                                        |
+|---------------------|----------------------------|------------------------------------------------|
+| Wood Duck           |                            | Often seen near ponds.                         |
+| Mallard             | Ponds, lakes, rivers       | Quack                                          |
+| Goose (not a duck!) | Water & wetlands           | **Large**  ,  *loud*  ,  noisy  ,  ~~small~~   |
+| Teal                | - Pond - Marsh - Riverbank | 1. Fly south in winter 2. Build nest on ground |
+
+| Genus                       | Species                   |
+|-----------------------------|---------------------------|
+| Aythya  (Diving ducks)      | Hawser, Common Pochard    |
+| Lophonetta  (Pintail group) | Fulvous Whistling Duck    |
+| Oxyura  (Benthic ducks)     | Wigee, Banded Water-screw |
+
+| Action   | Action                                                                                                  |
+|----------|---------------------------------------------------------------------------------------------------------|
+| **Swim** | Gracefully glide on H  2  O surfaces.                                                                   |
+| *Fly*    |                                                                                                         |
+| Quack    | | Type   | Sound        | |--------|--------------| | Short  | "quak"       | | Long   | "quaaaaaack" | |
+
+| Name              | Description                                  | Image                                                                                                                                                                     |
+|-------------------|----------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Donald Duck       | Cartoon character.                           | [View PNG](https://en.wikipedia.org/wiki/Donald_Duck#/media/File:Donald_Duck_angry_transparent_background.png)                                                            |
+| White-headed duck | A small diving duck some 45 cm (18 in) long. | White-headed duck thumbnail  <!-- image -->                                                                                                                               |
+| Mandarin Duck     | Known for its striking plumage.              | [View Full-Size Image](https://upload.wikimedia.org/wikipedia/commons/thumb/7/75/Mandarin_duck_%28Aix_galericulata%29.jpg/250px-Mandarin_duck_%28Aix_galericulata%29.jpg) |
+| Unknown Duck      | No photo available.                          |                                                                                                                                                                           |
+
+<!-- image -->
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
--- a/tests/data/html/html_rich_table_cells.html
+++ b/tests/data/html/html_rich_table_cells.html
@@ -0,0 +1,167 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>Rich Table Cells in HTML</title>
+<style>
+  table { border-collapse: collapse; width: 90%; margin: 1em auto; }
+  th, td { border: 1px solid #aaa; padding: 0.5rem; text-align: left; vertical-align: top; }
+  th { background:#f2f2f2; }
+</style>
+</head>
+
+<body>
+
+<h1>Rich Table Cells in HTML</h1>
+
+<!-- Simple data table -->
+<table>
+  <caption>Basic duck facts</caption>
+  <thead>
+    <tr><th>Name</th><th>Habitat</th><th>Comment</th></tr>
+  </thead>
+  <tbody>
+    <!-- empty cell -->
+    <tr><td>Wood Duck</td><td>&nbsp;</td><td>Often seen near ponds.</td></tr>
+
+    <!-- plain text -->
+    <tr><td>Mallard</td><td>Ponds, lakes, rivers</td><td>Quack</td></tr>
+
+    <!-- formatted text -->
+    <tr>
+      <td>Goose (not a duck!)</td>
+      <td style="color:#777;">Water & wetlands</td>
+      <td><strong>Large</strong>, <em>loud</em>, <u>noisy</u>, <s>small</s></td>
+    </tr>
+
+    <!-- list -->
+    <tr>
+      <td>Teal</td>
+      <td>
+        <ul style="margin:0;padding-left:1.2rem;">
+          <li>Pond</li>
+          <li>Marsh</li>
+          <li>Riverbank</li>
+        </ul>
+      </td>
+      <td>
+        <ol style="margin:0;padding-left:1.2rem;">
+          <li>Fly south in winter</li>
+          <li>Build nest on ground</li>
+        </ol>
+      </td>
+    </tr>
+  </tbody>
+</table>
+
+<!-- Table with mixed cell content -->
+<table>
+  <caption>Duck family tree (simplified)</caption>
+  <thead>
+    <tr><th>Genus</th><th>Species</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Aythya<br><small>(Diving ducks)</small></td>
+      <td>Hawser, Common Pochard</td>
+    </tr>
+    <tr>
+      <td>Lophonetta<br><small>(Pintail group)</small></td>
+      <td>Fulvous Whistling Duck</td>
+    </tr>
+    <tr>
+      <td>Oxyura<br><small>(Benthic ducks)</small></td>
+      <td>Wigee, Banded Water‑screw</td>
+    </tr>
+  </tbody>
+</table>
+
+<!-- Table with a mix of cell types and a nested table -->
+<table>
+  <caption>Duck‑related actions</caption>
+  <thead>
+    <tr style="background:#cce5ff;">
+      <th colspan="2">Action</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td><strong>Swim</strong></td>
+      <td>Gracefully glide on H<sub>2</sub>O surfaces.</td>
+    </tr>
+    <tr>
+      <td><em>Fly</em></td>
+      <td>&nbsp;</td> <!-- empty cell -->
+    </tr>
+    <tr>
+      <td><u>Quack</u></td>
+      <td>
+        <table>
+          <thead>
+            <tr><th>Type</th><th>Sound</th></tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td>Short</td>
+              <td>“quak”</td>
+            </tr>
+            <tr>
+              <td>Long</td>
+              <td>“quaaaaaack”</td>
+            </tr>
+          </tbody>
+        </table>
+      </td>
+    </tr>
+  </tbody>
+</table>
+
+<!-- Table with links -->
+<table>
+  <caption>Famous Ducks with Images</caption>
+  <thead>
+    <tr><th>Name</th><th>Description</th><th>Image</th></tr>
+  </thead>
+  <tbody>
+    <!-- Plain link to a PNG/JPG file -->
+    <tr>
+      <td>Donald Duck</td>
+      <td>Cartoon character.</td>
+      <td><a href="https://en.wikipedia.org/wiki/Donald_Duck#/media/File:Donald_Duck_angry_transparent_background.png" target="_blank">View PNG</a></td>
+    </tr>
+
+    <!-- Thumbnail image that opens in a new tab -->
+    <tr>
+      <td>White-headed duck</td>
+      <td>A small diving duck some 45 cm (18 in) long.</td>
+      <td>
+        <a href="https://upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif/lossy-page1-1920px-Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif.jpg" target="_blank">
+          <img class="thumb"
+               src="https://upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif/lossy-page1-250px-Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif.jpg"
+               alt="White-headed duck thumbnail">
+        </a>
+      </td>
+    </tr>
+
+    <!-- Link to a larger image with a caption -->
+    <tr>
+      <td>Mandarin Duck</td>
+      <td>Known for its striking plumage.</td>
+      <td>
+        <a href="https://upload.wikimedia.org/wikipedia/commons/thumb/7/75/Mandarin_duck_%28Aix_galericulata%29.jpg/250px-Mandarin_duck_%28Aix_galericulata%29.jpg" target="_blank">
+          View Full‑Size Image
+        </a>
+      </td>
+    </tr>
+
+    <!-- Empty image cell (to illustrate the empty case) -->
+    <tr>
+      <td>Unknown Duck</td>
+      <td>No photo available.</td>
+      <td>&nbsp;</td>
+    </tr>
+  </tbody>
+</table>
+
+</body>
+</html>
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -205,12 +205,14 @@ def test_extract_parent_hyperlinks():
    assert str(annotated_text_list[0].hyperlink) == a_tag.get("href")


-def get_html_paths():
+@pytest.fixture(scope="module")
+def html_paths() -> list[Path]:
    # Define the directory you want to search
    directory = Path("./tests/data/html/")

    # List all HTML files in the directory and its subdirectories
    html_files = sorted(directory.rglob("*.html"))
+
    return html_files


@@ -220,8 +222,7 @@ def get_converter():
    return converter


-def test_e2e_html_conversions():
-    html_paths = get_html_paths()
+def test_e2e_html_conversions(html_paths):
    converter = get_converter()

    for html_path in html_paths:
@@ -441,3 +442,84 @@ def test_fetch_remote_images(monkeypatch):
            "tests/data/html/example_image_01.png", "rb"
        )
        assert res.document
+
+
+def test_is_rich_table_cell(html_paths):
+    """Check ``_is_rich_table_cell`` on every cell of the rich-table sample file.
+
+    All tables of ``html_rich_table_cells.html`` (the nested one included) are
+    visited in document order and each cell is compared with its expected
+    classification: True for a rich cell, False for a plain one.
+    """
+    name = "html_rich_table_cells.html"
+    path = next(item for item in html_paths if item.name == name)
+
+    backend = HTMLDocumentBackend(
+        in_doc=InputDocument(
+            path_or_stream=path,
+            format=InputFormat.HTML,
+            backend=HTMLDocumentBackend,
+            filename=name,
+        ),
+        path_or_stream=path,
+    )
+
+    # Expected classification per table, one source row per line.
+    gt_cells: dict[int, list[bool]] = {
+        # table: Basic duck facts
+        0: [
+            False, False, False,
+            False, False, False,
+            False, False, False,
+            False, False, True,
+            False, True, True,
+        ],
+        # table: Duck family tree
+        1: [
+            False, False,
+            True, False,
+            True, False,
+            True, False,
+        ],
+        # table: Duck-related actions
+        2: [
+            False,
+            True, True,
+            True, False,
+            True, True,
+        ],
+        # table: nested table
+        3: [
+            False, False,
+            False, False,
+            False, False,
+        ],
+        # table: Famous Ducks with Images
+        4: [
+            False, False, False,
+            False, False, True,
+            False, False, True,
+            False, False, True,
+            False, False, False,
+        ],
+    }
+
+    for idx_t, table in enumerate(backend.soup.find_all("table")):
+        expected = iter(gt_cells[idx_t])
+        num_cells = 0
+        for part in table.find_all(["thead", "tbody"], recursive=False):
+            for idx_r, row in enumerate(part.find_all("tr", recursive=False)):
+                # A row without cells simply contributes no iterations here.
+                for idx_c, cell in enumerate(
+                    row.find_all(["td", "th"], recursive=False)
+                ):
+                    assert next(expected) == backend._is_rich_table_cell(cell), (
+                        f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
+                        f"with text: {cell.text}"
+                    )
+                    num_cells += 1
+        assert num_cells == len(gt_cells[idx_t]), (
+            f"Cell number does not match in table {idx_t}"
+        )
--- a/uv.lock
+++ b/uv.lock