fix(html): slow table parsing (#2582)

* fix(html): simplify parsing of simple table cells

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* tests(html): add test for rich table cells

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(html): ensure table cells with formatted text are parsed as RichTableCell

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* refactor(html): simplify process_rich_table_cells since only rich cells are processed

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(html): formatted cell runs should be parsed as text items respecting the order

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: pin latest docling-core and update uv.lock

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: upgrade dependencies on uv.lock

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-11-06 05:25:36 +01:00
committed by GitHub
parent 8da3d287ed
commit 0ba8d5d9e3
11 changed files with 9503 additions and 6544 deletions

View File

@@ -354,32 +354,51 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
) -> tuple[bool, Union[RefItem, None]]:
rich_table_cell = False
ref_for_rich_cell = None
if len(provs_in_cell) > 0:
ref_for_rich_cell = provs_in_cell[0]
if len(provs_in_cell) > 1:
# Cell has multiple elements, we need to group them
if len(provs_in_cell) >= 1:
# Cell rich cell has multiple elements, we need to group them
rich_table_cell = True
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
group_name, doc, provs_in_cell, docling_table
)
elif len(provs_in_cell) == 1:
item_ref = provs_in_cell[0]
pr_item = item_ref.resolve(doc)
if isinstance(pr_item, TextItem):
# Cell has only one element and it's just a text
rich_table_cell = False
try:
doc.delete_items(node_items=[pr_item])
except Exception as e:
_log.error(f"Error while making rich table: {e}.")
else:
rich_table_cell = True
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
group_name, doc, provs_in_cell, docling_table
)
return rich_table_cell, ref_for_rich_cell
def _is_rich_table_cell(self, table_cell: Tag) -> bool:
    """Determine whether a table cell should be parsed as a Docling RichTableCell.

    A table cell can hold rich content and be parsed with a Docling RichTableCell.
    However, this requires walking through the content elements and creating
    Docling node items. If the cell holds only plain text, the parsing is simpler
    and using a TableCell is preferred.

    Args:
        table_cell: The HTML tag representing a table cell.

    Returns:
        Whether the cell should be parsed as RichTableCell.
    """
    children = table_cell.find_all(recursive=True)  # all descendants of type Tag
    if not children:
        # No nested tags: the cell is rich only if it holds more than one
        # string node directly under it.
        text_nodes = [
            item
            for item in table_cell.contents
            if isinstance(item, NavigableString)
        ]
        return len(text_nodes) > 1

    annotations = self._extract_text_and_hyperlink_recursively(
        table_cell, find_parent_annotation=True
    )
    if not annotations:
        # Nested tags but no extracted text: rich only when an image is present.
        # NOTE: `any(...)` is required here; `bool(<generator expression>)` is
        # always True because a generator object is truthy even when empty.
        return any(item.name == "img" for item in children)
    if len(annotations) == 1:
        # A single text run is rich only if it carries formatting, a hyperlink,
        # or is marked as code.
        anno: AnnotatedText = annotations[0]
        return bool(anno.formatting) or bool(anno.hyperlink) or bool(anno.code)
    # Several text runs: always treated as rich content.
    return True
def parse_table_data(
self,
element: Tag,
@@ -437,23 +456,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
formula.replace_with(NavigableString(math_formula))
provs_in_cell: list[RefItem] = []
# Parse table cell sub-tree for Rich Cells content:
table_level = self.level
provs_in_cell = self._walk(html_cell, doc)
# After walking sub-tree in cell, restore previously set level
self.level = table_level
rich_table_cell = self._is_rich_table_cell(html_cell)
if rich_table_cell:
# Parse table cell sub-tree for Rich Cells content:
table_level = self.level
provs_in_cell = self._walk(html_cell, doc)
# After walking sub-tree in cell, restore previously set level
self.level = table_level
rich_table_cell = False
ref_for_rich_cell = None
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
rich_table_cell, ref_for_rich_cell = (
HTMLDocumentBackend.process_rich_table_cells(
provs_in_cell, group_name, doc, docling_table
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
rich_table_cell, ref_for_rich_cell = (
HTMLDocumentBackend.process_rich_table_cells(
provs_in_cell, group_name, doc, docling_table
)
)
)
# Extracting text
text = self.get_text(html_cell).strip()
text = HTMLDocumentBackend._clean_unicode(
self.get_text(html_cell).strip()
)
col_span, row_span = self._get_cell_spans(html_cell)
if row_header:
row_span -= 1
@@ -555,6 +576,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if im_ref3:
added_refs.append(im_ref3)
elif name in _FORMAT_TAG_MAP:
flush_buffer()
with self._use_format([name]):
wk = self._walk(node, doc)
added_refs.extend(wk)

View File

@@ -45,7 +45,7 @@ authors = [
requires-python = '>=3.9,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.48.2,<3.0.0)',
'docling-core[chunking] (>=2.50.1,<3.0.0)',
'docling-parse (>=4.7.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)',

View File

@@ -0,0 +1,53 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Rich Table Cells in HTML
item-2 at level 2: table with [5x3]
item-3 at level 3: unspecified: group rich_cell_group_1_1_3
item-4 at level 4: text: Large
item-5 at level 4: text: ,
item-6 at level 4: text: loud
item-7 at level 4: text: ,
item-8 at level 4: text: noisy
item-9 at level 4: text: ,
item-10 at level 4: text: small
item-11 at level 3: unspecified: group rich_cell_group_1_0_4
item-12 at level 4: list: group list
item-13 at level 5: list_item: Pond
item-14 at level 5: list_item: Marsh
item-15 at level 5: list_item: Riverbank
item-16 at level 3: unspecified: group rich_cell_group_1_1_4
item-17 at level 4: list: group ordered list
item-18 at level 5: list_item: Fly south in winter
item-19 at level 5: list_item: Build nest on ground
item-20 at level 2: table with [4x2]
item-21 at level 3: unspecified: group rich_cell_group_2_0_1
item-22 at level 4: text: Aythya
item-23 at level 4: text: (Diving ducks)
item-24 at level 3: unspecified: group rich_cell_group_2_0_2
item-25 at level 4: text: Lophonetta
item-26 at level 4: text: (Pintail group)
item-27 at level 3: unspecified: group rich_cell_group_2_0_3
item-28 at level 4: text: Oxyura
item-29 at level 4: text: (Benthic ducks)
item-30 at level 2: table with [4x2]
item-31 at level 3: unspecified: group rich_cell_group_3_0_1
item-32 at level 4: text: Swim
item-33 at level 3: unspecified: group rich_cell_group_3_0_1
item-34 at level 4: text: Gracefully glide on H
item-35 at level 4: text: 2
item-36 at level 4: text: O surfaces.
item-37 at level 3: unspecified: group rich_cell_group_3_0_2
item-38 at level 4: text: Fly
item-39 at level 3: unspecified: group rich_cell_group_3_0_3
item-40 at level 4: text: Quack
item-41 at level 3: unspecified: group rich_cell_group_4_0_3
item-42 at level 4: table with [3x2]
item-43 at level 2: table with [5x3]
item-44 at level 3: unspecified: group rich_cell_group_5_1_1
item-45 at level 4: text: View PNG
item-46 at level 3: unspecified: group rich_cell_group_5_1_2
item-47 at level 4: picture
item-47 at level 5: caption: White-headed duck thumbnail
item-48 at level 3: unspecified: group rich_cell_group_5_1_3
item-49 at level 4: text: View Full-Size Image
item-50 at level 2: picture
item-51 at level 1: caption: White-headed duck thumbnail

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,29 @@
# Rich Table Cells in HTML
| Name | Habitat | Comment |
|---------------------|----------------------------|------------------------------------------------|
| Wood Duck | | Often seen near ponds. |
| Mallard | Ponds, lakes, rivers | Quack |
| Goose (not a duck!) | Water & wetlands | **Large** , *loud* , noisy , ~~small~~ |
| Teal | - Pond - Marsh - Riverbank | 1. Fly south in winter 2. Build nest on ground |
| Genus | Species |
|-----------------------------|---------------------------|
| Aythya (Diving ducks) | Hawser, Common Pochard |
| Lophonetta (Pintail group) | Fulvous Whistling Duck |
| Oxyura (Benthic ducks) | Wigee, Banded Water-screw |
| Action | Action |
|----------|---------------------------------------------------------------------------------------------------------|
| **Swim** | Gracefully glide on H 2 O surfaces. |
| *Fly* | |
| Quack | | Type | Sound | |--------|--------------| | Short | "quak" | | Long | "quaaaaaack" | |
| Name | Description | Image |
|-------------------|----------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Donald Duck | Cartoon character. | [View PNG](https://en.wikipedia.org/wiki/Donald_Duck#/media/File:Donald_Duck_angry_transparent_background.png) |
| White-headed duck | A small diving duck some 45 cm (18 in) long. | White-headed duck thumbnail <!-- image --> |
| Mandarin Duck | Known for its striking plumage. | [View Full-Size Image](https://upload.wikimedia.org/wikipedia/commons/thumb/7/75/Mandarin_duck_%28Aix_galericulata%29.jpg/250px-Mandarin_duck_%28Aix_galericulata%29.jpg) |
| Unknown Duck | No photo available. | |
<!-- image -->

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,167 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Rich Table Cells in HTML</title>
<style>
table { border-collapse: collapse; width: 90%; margin: 1em auto; }
th, td { border: 1px solid #aaa; padding: 0.5rem; text-align: left; vertical-align: top; }
th { background:#f2f2f2; }
</style>
</head>
<body>
<h1>Rich Table Cells in HTML</h1>
<!-- Simple data table -->
<table>
<caption>Basic duck facts</caption>
<thead>
<tr><th>Name</th><th>Habitat</th><th>Comment</th></tr>
</thead>
<tbody>
<!-- empty cell -->
<tr><td>Wood Duck</td><td>&nbsp;</td><td>Often seen near ponds.</td></tr>
<!-- plain text -->
<tr><td>Mallard</td><td>Ponds, lakes, rivers</td><td>Quack</td></tr>
<!-- formatted text -->
<tr>
<td>Goose (not a duck!)</td>
<td style="color:#777;">Water & wetlands</td>
<td><strong>Large</strong>, <em>loud</em>, <u>noisy</u>, <s>small</s></td>
</tr>
<!-- list -->
<tr>
<td>Teal</td>
<td>
<ul style="margin:0;padding-left:1.2rem;">
<li>Pond</li>
<li>Marsh</li>
<li>Riverbank</li>
</ul>
</td>
<td>
<ol style="margin:0;padding-left:1.2rem;">
<li>Fly south in winter</li>
<li>Build nest on ground</li>
</ol>
</td>
</tr>
</tbody>
</table>
<!-- Table with mixed cell content -->
<table>
<caption>Duck family tree (simplified)</caption>
<thead>
<tr><th>Genus</th><th>Species</th></tr>
</thead>
<tbody>
<tr>
<td>Aythya<br><small>(Diving ducks)</small></td>
<td>Hawser, Common Pochard</td>
</tr>
<tr>
<td>Lophonetta<br><small>(Pintail group)</small></td>
<td>Fulvous Whistling Duck</td>
</tr>
<tr>
<td>Oxyura<br><small>(Benthic ducks)</small></td>
<td>Wigee, Banded Water‑screw</td>
</tr>
</tbody>
</table>
<!-- Table with a mix of cell types and a nested table -->
<table>
<caption>Duck‑related actions</caption>
<thead>
<tr style="background:#cce5ff;">
<th colspan="2">Action</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Swim</strong></td>
<td>Gracefully glide on H<sub>2</sub>O surfaces.</td>
</tr>
<tr>
<td><em>Fly</em></td>
<td>&nbsp;</td> <!-- empty cell -->
</tr>
<tr>
<td><u>Quack</u></td>
<td>
<table>
<thead>
<tr><th>Type</th><th>Sound</th></tr>
</thead>
<tbody>
<tr>
<td>Short</td>
<td>“quak”</td>
</tr>
<tr>
<td>Long</td>
<td>“quaaaaaack”</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
<!-- Table with links -->
<table>
<caption>Famous Ducks with Images</caption>
<thead>
<tr><th>Name</th><th>Description</th><th>Image</th></tr>
</thead>
<tbody>
<!-- Plain link to a PNG/JPG file -->
<tr>
<td>Donald Duck</td>
<td>Cartoon character.</td>
<td><a href="https://en.wikipedia.org/wiki/Donald_Duck#/media/File:Donald_Duck_angry_transparent_background.png" target="_blank">View PNG</a></td>
</tr>
<!-- Thumbnail image that opens in a new tab -->
<tr>
<td>White-headed duck</td>
<td>A small diving duck some 45 cm (18 in) long.</td>
<td>
<a href="https://upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif/lossy-page1-1920px-Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif.jpg" target="_blank">
<img class="thumb"
src="https://upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif/lossy-page1-250px-Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif.jpg"
alt="White-headed duck thumbnail">
</a>
</td>
</tr>
<!-- Link to a larger image with a caption -->
<tr>
<td>Mandarin Duck</td>
<td>Known for its striking plumage.</td>
<td>
<a href="https://upload.wikimedia.org/wikipedia/commons/thumb/7/75/Mandarin_duck_%28Aix_galericulata%29.jpg/250px-Mandarin_duck_%28Aix_galericulata%29.jpg" target="_blank">
View Full‑Size Image
</a>
</td>
</tr>
<!-- Empty image cell (to illustrate the empty case) -->
<tr>
<td>Unknown Duck</td>
<td>No photo available.</td>
<td>&nbsp;</td>
</tr>
</tbody>
</table>
</body>
</html>

View File

@@ -205,12 +205,14 @@ def test_extract_parent_hyperlinks():
assert str(annotated_text_list[0].hyperlink) == a_tag.get("href")
def get_html_paths():
@pytest.fixture(scope="module")
def html_paths() -> list[Path]:
    """Collect the HTML test files, sorted, from the test data directory.

    The search is recursive, so files in subdirectories are included.
    """
    return sorted(Path("./tests/data/html/").rglob("*.html"))
@@ -220,8 +222,7 @@ def get_converter():
return converter
def test_e2e_html_conversions():
html_paths = get_html_paths()
def test_e2e_html_conversions(html_paths):
converter = get_converter()
for html_path in html_paths:
@@ -441,3 +442,84 @@ def test_fetch_remote_images(monkeypatch):
"tests/data/html/example_image_01.png", "rb"
)
assert res.document
def test_is_rich_table_cell(html_paths):
    """Check `_is_rich_table_cell` against per-cell ground truth for every table.

    Cells are visited table by table, in row order over the direct `thead` and
    `tbody` children, and compared with the expected flags listed below.
    """
    name = "html_rich_table_cells.html"
    path = next(item for item in html_paths if item.name == name)

    backend = HTMLDocumentBackend(
        in_doc=InputDocument(
            path_or_stream=path,
            format=InputFormat.HTML,
            backend=HTMLDocumentBackend,
            filename=name,
        ),
        path_or_stream=path,
    )

    # Expected flags, one list per table; each inner line is one table row.
    gt_cells: dict[int, list[bool]] = {
        # table: Basic duck facts
        0: [
            False, False, False,
            False, False, False,
            False, False, False,
            False, False, True,
            False, True, True,
        ],
        # table: Duck family tree
        1: [False, False, True, False, True, False, True, False],
        # table: Duck-related actions
        2: [False, True, True, True, False, True, True],
        # table: nested table
        3: [False, False, False, False, False, False],
        # table: Famous Ducks with Images
        4: [
            False, False, False,
            False, False, True,
            False, False, True,
            False, False, True,
            False, False, False,
        ],
    }

    for idx_t, table in enumerate(backend.soup.find_all("table")):
        expected_flags = iter(gt_cells[idx_t])
        num_cells = 0
        # Only direct children are searched, so cells of a nested table are
        # not counted as part of the enclosing table.
        for part in table.find_all(["thead", "tbody"], recursive=False):
            for idx_r, row in enumerate(part.find_all("tr", recursive=False)):
                cells = row.find_all(["td", "th"], recursive=False)
                for idx_c, cell in enumerate(cells):
                    expected = next(expected_flags)
                    actual = backend._is_rich_table_cell(cell)
                    assert expected == actual, (
                        f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
                        f"with text: {cell.text}"
                    )
                    num_cells += 1
        assert num_cells == len(gt_cells[idx_t]), (
            f"Cell number does not match in table {idx_t}"
        )

1454
uv.lock generated

File diff suppressed because it is too large Load Diff