fix(html): slow table parsing (#2582)

* fix(html): simplify parsing of simple table cells

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* tests(html): add test for rich table cells

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(html): ensure table cells with formatted text are parsed as RichTableCell

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* refactor(html): simplify process_rich_table_cells since only rich cells are processed

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(html): formatted cell runs should be parsed as text items respecting the order

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: pin latest docling-core and update uv.lock

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: upgrade dependencies on uv.lock

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-11-06 05:25:36 +01:00
parent 8da3d287ed
commit 0ba8d5d9e3
11 changed files with 9503 additions and 6544 deletions
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -205,12 +205,14 @@ def test_extract_parent_hyperlinks():
    assert str(annotated_text_list[0].hyperlink) == a_tag.get("href")


-def get_html_paths():
+@pytest.fixture(scope="module")
+def html_paths() -> list[Path]:
    # Define the directory you want to search
    directory = Path("./tests/data/html/")

    # List all HTML files in the directory and its subdirectories
    html_files = sorted(directory.rglob("*.html"))
+
    return html_files


@@ -220,8 +222,7 @@ def get_converter():
    return converter


-def test_e2e_html_conversions():
-    html_paths = get_html_paths()
+def test_e2e_html_conversions(html_paths):
    converter = get_converter()

    for html_path in html_paths:
@@ -441,3 +442,84 @@ def test_fetch_remote_images(monkeypatch):
            "tests/data/html/example_image_01.png", "rb"
        )
        assert res.document
+
+
+def test_is_rich_table_cell(html_paths):
+    """Check _is_rich_table_cell against per-cell ground truth for each table."""
+
+    name = "html_rich_table_cells.html"
+    path = next(item for item in html_paths if item.name == name)
+
+    in_doc = InputDocument(
+        path_or_stream=path,
+        format=InputFormat.HTML,
+        backend=HTMLDocumentBackend,
+        filename=name,
+    )
+    backend = HTMLDocumentBackend(
+        in_doc=in_doc,
+        path_or_stream=path,
+    )
+
+    gt_cells: dict[int, list[bool]] = {}  # table index -> expected flag per cell
+    # table: Basic duck facts
+    gt_cells[0] = [
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        True,
+        False,
+        True,
+        True,
+    ]
+    # table: Duck family tree
+    gt_cells[1] = [False, False, True, False, True, False, True, False]
+    # table: Duck-related actions
+    gt_cells[2] = [False, True, True, True, False, True, True]
+    # table: nested table
+    gt_cells[3] = [False, False, False, False, False, False]
+    # table: Famous Ducks with Images
+    gt_cells[4] = [
+        False,
+        False,
+        False,
+        False,
+        False,
+        True,
+        False,
+        False,
+        True,
+        False,
+        False,
+        True,
+        False,
+        False,
+        False,
+    ]
+
+    for idx_t, table in enumerate(backend.soup.find_all("table")):
+        gt_it = iter(gt_cells[idx_t])  # one expected flag per visited cell
+        num_cells = 0
+        containers = table.find_all(["thead", "tbody"], recursive=False)
+        for part in containers:
+            for idx_r, row in enumerate(part.find_all("tr", recursive=False)):
+                cells = row.find_all(["td", "th"], recursive=False)
+                if not cells:
+                    continue  # row has no direct td/th children
+                for idx_c, cell in enumerate(cells):
+                    assert next(gt_it) == backend._is_rich_table_cell(cell), (
+                        f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
+                        f"with text: {cell.text}"
+                    )
+                    num_cells += 1
+        assert num_cells == len(gt_cells[idx_t]), (  # every expected flag was used
+            f"Cell number does not match in table {idx_t}"
+        )