mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(html): slow table parsing (#2582)
* fix(html): simplify parsing of simple table cells Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(html): add test for rich table cells Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(html): ensure table cells with formatted text are parsed as RichTableCell Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): simplify process_rich_table_cells since only rich cells are processed Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(html): formatted cell runs should be parsed as text items respecting the order Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: pin latest docling-core and update uv.lock Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: upgrade dependencies on uv.lock Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
8da3d287ed
commit
0ba8d5d9e3
@@ -354,32 +354,51 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
) -> tuple[bool, Union[RefItem, None]]:
|
||||
rich_table_cell = False
|
||||
ref_for_rich_cell = None
|
||||
if len(provs_in_cell) > 0:
|
||||
ref_for_rich_cell = provs_in_cell[0]
|
||||
if len(provs_in_cell) > 1:
|
||||
# Cell has multiple elements, we need to group them
|
||||
if len(provs_in_cell) >= 1:
|
||||
# Rich cell has one or more elements, we need to group them
|
||||
rich_table_cell = True
|
||||
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
|
||||
group_name, doc, provs_in_cell, docling_table
|
||||
)
|
||||
elif len(provs_in_cell) == 1:
|
||||
item_ref = provs_in_cell[0]
|
||||
pr_item = item_ref.resolve(doc)
|
||||
if isinstance(pr_item, TextItem):
|
||||
# Cell has only one element and it's just a text
|
||||
rich_table_cell = False
|
||||
try:
|
||||
doc.delete_items(node_items=[pr_item])
|
||||
except Exception as e:
|
||||
_log.error(f"Error while making rich table: {e}.")
|
||||
else:
|
||||
rich_table_cell = True
|
||||
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
|
||||
group_name, doc, provs_in_cell, docling_table
|
||||
)
|
||||
|
||||
return rich_table_cell, ref_for_rich_cell
|
||||
|
||||
def _is_rich_table_cell(self, table_cell: Tag) -> bool:
    """Determine whether a table cell should be parsed as a Docling RichTableCell.

    A table cell can hold rich content and be parsed with a Docling RichTableCell.
    However, this requires walking through the content elements and creating
    Docling node items. If the cell holds only plain text, the parsing is simpler
    and using a TableCell is preferred.

    Args:
        table_cell: The HTML tag representing a table cell.

    Returns:
        Whether the cell should be parsed as RichTableCell.
    """
    children = table_cell.find_all(recursive=True)  # all descendants of type Tag
    if not children:
        # No nested tags: the cell is rich only when its direct contents hold
        # more than one NavigableString.
        content = [
            item
            for item in table_cell.contents
            if isinstance(item, NavigableString)
        ]
        return len(content) > 1

    annotations = self._extract_text_and_hyperlink_recursively(
        table_cell, find_parent_annotation=True
    )
    if not annotations:
        # Nested tags but no annotated text was extracted: only an image makes
        # the cell rich. `any` is required here because the truth value of a
        # generator object is always True, regardless of what it would yield.
        return any(item.name == "img" for item in children)

    if len(annotations) == 1:
        # A single text run is rich only if it carries formatting, a hyperlink
        # or the code flag; otherwise it is plain text.
        anno: AnnotatedText = annotations[0]
        return bool(anno.formatting) or bool(anno.hyperlink) or bool(anno.code)

    # Several annotated runs: keep the cell rich.
    return True
|
||||
|
||||
def parse_table_data(
|
||||
self,
|
||||
element: Tag,
|
||||
@@ -437,23 +456,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
formula.replace_with(NavigableString(math_formula))
|
||||
|
||||
provs_in_cell: list[RefItem] = []
|
||||
# Parse table cell sub-tree for Rich Cells content:
|
||||
table_level = self.level
|
||||
provs_in_cell = self._walk(html_cell, doc)
|
||||
# After walking sub-tree in cell, restore previously set level
|
||||
self.level = table_level
|
||||
rich_table_cell = self._is_rich_table_cell(html_cell)
|
||||
if rich_table_cell:
|
||||
# Parse table cell sub-tree for Rich Cells content:
|
||||
table_level = self.level
|
||||
provs_in_cell = self._walk(html_cell, doc)
|
||||
# After walking sub-tree in cell, restore previously set level
|
||||
self.level = table_level
|
||||
|
||||
rich_table_cell = False
|
||||
ref_for_rich_cell = None
|
||||
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
|
||||
rich_table_cell, ref_for_rich_cell = (
|
||||
HTMLDocumentBackend.process_rich_table_cells(
|
||||
provs_in_cell, group_name, doc, docling_table
|
||||
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
|
||||
rich_table_cell, ref_for_rich_cell = (
|
||||
HTMLDocumentBackend.process_rich_table_cells(
|
||||
provs_in_cell, group_name, doc, docling_table
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Extracting text
|
||||
text = self.get_text(html_cell).strip()
|
||||
text = HTMLDocumentBackend._clean_unicode(
|
||||
self.get_text(html_cell).strip()
|
||||
)
|
||||
col_span, row_span = self._get_cell_spans(html_cell)
|
||||
if row_header:
|
||||
row_span -= 1
|
||||
@@ -555,6 +576,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if im_ref3:
|
||||
added_refs.append(im_ref3)
|
||||
elif name in _FORMAT_TAG_MAP:
|
||||
flush_buffer()
|
||||
with self._use_format([name]):
|
||||
wk = self._walk(node, doc)
|
||||
added_refs.extend(wk)
|
||||
|
||||
@@ -45,7 +45,7 @@ authors = [
|
||||
requires-python = '>=3.9,<4.0'
|
||||
dependencies = [
|
||||
'pydantic (>=2.0.0,<3.0.0)',
|
||||
'docling-core[chunking] (>=2.48.2,<3.0.0)',
|
||||
'docling-core[chunking] (>=2.50.1,<3.0.0)',
|
||||
'docling-parse (>=4.7.0,<5.0.0)',
|
||||
"docling-ibm-models>=3.9.1,<4",
|
||||
'filetype (>=1.2.0,<2.0.0)',
|
||||
|
||||
53
tests/data/groundtruth/docling_v2/html_rich_table_cells.html.itxt
vendored
Normal file
53
tests/data/groundtruth/docling_v2/html_rich_table_cells.html.itxt
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Rich Table Cells in HTML
|
||||
item-2 at level 2: table with [5x3]
|
||||
item-3 at level 3: unspecified: group rich_cell_group_1_1_3
|
||||
item-4 at level 4: text: Large
|
||||
item-5 at level 4: text: ,
|
||||
item-6 at level 4: text: loud
|
||||
item-7 at level 4: text: ,
|
||||
item-8 at level 4: text: noisy
|
||||
item-9 at level 4: text: ,
|
||||
item-10 at level 4: text: small
|
||||
item-11 at level 3: unspecified: group rich_cell_group_1_0_4
|
||||
item-12 at level 4: list: group list
|
||||
item-13 at level 5: list_item: Pond
|
||||
item-14 at level 5: list_item: Marsh
|
||||
item-15 at level 5: list_item: Riverbank
|
||||
item-16 at level 3: unspecified: group rich_cell_group_1_1_4
|
||||
item-17 at level 4: list: group ordered list
|
||||
item-18 at level 5: list_item: Fly south in winter
|
||||
item-19 at level 5: list_item: Build nest on ground
|
||||
item-20 at level 2: table with [4x2]
|
||||
item-21 at level 3: unspecified: group rich_cell_group_2_0_1
|
||||
item-22 at level 4: text: Aythya
|
||||
item-23 at level 4: text: (Diving ducks)
|
||||
item-24 at level 3: unspecified: group rich_cell_group_2_0_2
|
||||
item-25 at level 4: text: Lophonetta
|
||||
item-26 at level 4: text: (Pintail group)
|
||||
item-27 at level 3: unspecified: group rich_cell_group_2_0_3
|
||||
item-28 at level 4: text: Oxyura
|
||||
item-29 at level 4: text: (Benthic ducks)
|
||||
item-30 at level 2: table with [4x2]
|
||||
item-31 at level 3: unspecified: group rich_cell_group_3_0_1
|
||||
item-32 at level 4: text: Swim
|
||||
item-33 at level 3: unspecified: group rich_cell_group_3_0_1
|
||||
item-34 at level 4: text: Gracefully glide on H
|
||||
item-35 at level 4: text: 2
|
||||
item-36 at level 4: text: O surfaces.
|
||||
item-37 at level 3: unspecified: group rich_cell_group_3_0_2
|
||||
item-38 at level 4: text: Fly
|
||||
item-39 at level 3: unspecified: group rich_cell_group_3_0_3
|
||||
item-40 at level 4: text: Quack
|
||||
item-41 at level 3: unspecified: group rich_cell_group_4_0_3
|
||||
item-42 at level 4: table with [3x2]
|
||||
item-43 at level 2: table with [5x3]
|
||||
item-44 at level 3: unspecified: group rich_cell_group_5_1_1
|
||||
item-45 at level 4: text: View PNG
|
||||
item-46 at level 3: unspecified: group rich_cell_group_5_1_2
|
||||
item-47 at level 4: picture
|
||||
item-47 at level 5: caption: White-headed duck thumbnail
|
||||
item-48 at level 3: unspecified: group rich_cell_group_5_1_3
|
||||
item-49 at level 4: text: View Full-Size Image
|
||||
item-50 at level 2: picture
|
||||
item-51 at level 1: caption: White-headed duck thumbnail
|
||||
2355
tests/data/groundtruth/docling_v2/html_rich_table_cells.html.json
vendored
Normal file
2355
tests/data/groundtruth/docling_v2/html_rich_table_cells.html.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
29
tests/data/groundtruth/docling_v2/html_rich_table_cells.html.md
vendored
Normal file
29
tests/data/groundtruth/docling_v2/html_rich_table_cells.html.md
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
# Rich Table Cells in HTML
|
||||
|
||||
| Name | Habitat | Comment |
|
||||
|---------------------|----------------------------|------------------------------------------------|
|
||||
| Wood Duck | | Often seen near ponds. |
|
||||
| Mallard | Ponds, lakes, rivers | Quack |
|
||||
| Goose (not a duck!) | Water & wetlands | **Large** , *loud* , noisy , ~~small~~ |
|
||||
| Teal | - Pond - Marsh - Riverbank | 1. Fly south in winter 2. Build nest on ground |
|
||||
|
||||
| Genus | Species |
|
||||
|-----------------------------|---------------------------|
|
||||
| Aythya (Diving ducks) | Hawser, Common Pochard |
|
||||
| Lophonetta (Pintail group) | Fulvous Whistling Duck |
|
||||
| Oxyura (Benthic ducks) | Wigee, Banded Water-screw |
|
||||
|
||||
| Action | Action |
|
||||
|----------|---------------------------------------------------------------------------------------------------------|
|
||||
| **Swim** | Gracefully glide on H 2 O surfaces. |
|
||||
| *Fly* | |
|
||||
| Quack | | Type | Sound | |--------|--------------| | Short | "quak" | | Long | "quaaaaaack" | |
|
||||
|
||||
| Name | Description | Image |
|
||||
|-------------------|----------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| Donald Duck | Cartoon character. | [View PNG](https://en.wikipedia.org/wiki/Donald_Duck#/media/File:Donald_Duck_angry_transparent_background.png) |
|
||||
| White-headed duck | A small diving duck some 45 cm (18 in) long. | White-headed duck thumbnail <!-- image --> |
|
||||
| Mandarin Duck | Known for its striking plumage. | [View Full-Size Image](https://upload.wikimedia.org/wikipedia/commons/thumb/7/75/Mandarin_duck_%28Aix_galericulata%29.jpg/250px-Mandarin_duck_%28Aix_galericulata%29.jpg) |
|
||||
| Unknown Duck | No photo available. | |
|
||||
|
||||
<!-- image -->
|
||||
2401
tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
vendored
2401
tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
vendored
File diff suppressed because it is too large
Load Diff
8294
tests/data/groundtruth/docling_v2/wiki_duck.html.json
vendored
8294
tests/data/groundtruth/docling_v2/wiki_duck.html.json
vendored
File diff suppressed because it is too large
Load Diff
1118
tests/data/groundtruth/docling_v2/wiki_duck.html.md
vendored
1118
tests/data/groundtruth/docling_v2/wiki_duck.html.md
vendored
File diff suppressed because it is too large
Load Diff
167
tests/data/html/html_rich_table_cells.html
vendored
Normal file
167
tests/data/html/html_rich_table_cells.html
vendored
Normal file
@@ -0,0 +1,167 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Rich Table Cells in HTML</title>
|
||||
<style>
|
||||
table { border-collapse: collapse; width: 90%; margin: 1em auto; }
|
||||
th, td { border: 1px solid #aaa; padding: 0.5rem; text-align: left; vertical-align: top; }
|
||||
th { background:#f2f2f2; }
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h1>Rich Table Cells in HTML</h1>
|
||||
|
||||
<!-- Simple data table -->
|
||||
<table>
|
||||
<caption>Basic duck facts</caption>
|
||||
<thead>
|
||||
<tr><th>Name</th><th>Habitat</th><th>Comment</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<!-- empty cell -->
|
||||
<tr><td>Wood Duck</td><td> </td><td>Often seen near ponds.</td></tr>
|
||||
|
||||
<!-- plain text -->
|
||||
<tr><td>Mallard</td><td>Ponds, lakes, rivers</td><td>Quack</td></tr>
|
||||
|
||||
<!-- formatted text -->
|
||||
<tr>
|
||||
<td>Goose (not a duck!)</td>
|
||||
<td style="color:#777;">Water & wetlands</td>
|
||||
<td><strong>Large</strong>, <em>loud</em>, <u>noisy</u>, <s>small</s></td>
|
||||
</tr>
|
||||
|
||||
<!-- list -->
|
||||
<tr>
|
||||
<td>Teal</td>
|
||||
<td>
|
||||
<ul style="margin:0;padding-left:1.2rem;">
|
||||
<li>Pond</li>
|
||||
<li>Marsh</li>
|
||||
<li>Riverbank</li>
|
||||
</ul>
|
||||
</td>
|
||||
<td>
|
||||
<ol style="margin:0;padding-left:1.2rem;">
|
||||
<li>Fly south in winter</li>
|
||||
<li>Build nest on ground</li>
|
||||
</ol>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<!-- Table with mixed cell content -->
|
||||
<table>
|
||||
<caption>Duck family tree (simplified)</caption>
|
||||
<thead>
|
||||
<tr><th>Genus</th><th>Species</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Aythya<br><small>(Diving ducks)</small></td>
|
||||
<td>Hawser, Common Pochard</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Lophonetta<br><small>(Pintail group)</small></td>
|
||||
<td>Fulvous Whistling Duck</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Oxyura<br><small>(Benthic ducks)</small></td>
|
||||
<td>Wigee, Banded Water‑screw</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<!-- Table with a mix of cell types and a nested table -->
|
||||
<table>
|
||||
<caption>Duck‑related actions</caption>
|
||||
<thead>
|
||||
<tr style="background:#cce5ff;">
|
||||
<th colspan="2">Action</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><strong>Swim</strong></td>
|
||||
<td>Gracefully glide on H<sub>2</sub>O surfaces.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><em>Fly</em></td>
|
||||
<td> </td> <!-- empty cell -->
|
||||
</tr>
|
||||
<tr>
|
||||
<td><u>Quack</u></td>
|
||||
<td>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Type</th><th>Sound</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Short</td>
|
||||
<td>“quak”</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Long</td>
|
||||
<td>“quaaaaaack”</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<!-- Table with links -->
|
||||
<table>
|
||||
<caption>Famous Ducks with Images</caption>
|
||||
<thead>
|
||||
<tr><th>Name</th><th>Description</th><th>Image</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<!-- Plain link to a PNG/JPG file -->
|
||||
<tr>
|
||||
<td>Donald Duck</td>
|
||||
<td>Cartoon character.</td>
|
||||
<td><a href="https://en.wikipedia.org/wiki/Donald_Duck#/media/File:Donald_Duck_angry_transparent_background.png" target="_blank">View PNG</a></td>
|
||||
</tr>
|
||||
|
||||
<!-- Thumbnail image that opens in a new tab -->
|
||||
<tr>
|
||||
<td>White-headed duck</td>
|
||||
<td>A small diving duck some 45 cm (18 in) long.</td>
|
||||
<td>
|
||||
<a href="https://upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif/lossy-page1-1920px-Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif.jpg" target="_blank">
|
||||
<img class="thumb"
|
||||
src="https://upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif/lossy-page1-250px-Witkopeend_-_white-headed_duck_-_Oxyura_leucocephala_3.tif.jpg"
|
||||
alt="White-headed duck thumbnail">
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Link to a larger image with a caption -->
|
||||
<tr>
|
||||
<td>Mandarin Duck</td>
|
||||
<td>Known for its striking plumage.</td>
|
||||
<td>
|
||||
<a href="https://upload.wikimedia.org/wikipedia/commons/thumb/7/75/Mandarin_duck_%28Aix_galericulata%29.jpg/250px-Mandarin_duck_%28Aix_galericulata%29.jpg" target="_blank">
|
||||
View Full‑Size Image
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Empty image cell (to illustrate the empty case) -->
|
||||
<tr>
|
||||
<td>Unknown Duck</td>
|
||||
<td>No photo available.</td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -205,12 +205,14 @@ def test_extract_parent_hyperlinks():
|
||||
assert str(annotated_text_list[0].hyperlink) == a_tag.get("href")
|
||||
|
||||
|
||||
def get_html_paths():
|
||||
@pytest.fixture(scope="module")
def html_paths() -> list[Path]:
    """Return all ``*.html`` files found under ``./tests/data/html/``, sorted.

    The search uses ``rglob``, so files in subdirectories are included too.
    """
    return sorted(Path("./tests/data/html/").rglob("*.html"))
|
||||
|
||||
|
||||
@@ -220,8 +222,7 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def test_e2e_html_conversions():
|
||||
html_paths = get_html_paths()
|
||||
def test_e2e_html_conversions(html_paths):
|
||||
converter = get_converter()
|
||||
|
||||
for html_path in html_paths:
|
||||
@@ -441,3 +442,84 @@ def test_fetch_remote_images(monkeypatch):
|
||||
"tests/data/html/example_image_01.png", "rb"
|
||||
)
|
||||
assert res.document
|
||||
|
||||
|
||||
def test_is_rich_table_cell(html_paths):
    """Check ``_is_rich_table_cell`` against hand-labelled cells of the test page."""
    name = "html_rich_table_cells.html"
    path = next(item for item in html_paths if item.name == name)

    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename=name,
    )
    backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=path)

    # Expected result per cell, keyed by the position of the table in
    # `soup.find_all("table")`; cells are listed row by row, header rows first.
    gt_cells: dict[int, list[bool]] = {
        # table: Basic duck facts
        0: [
            *(False, False, False),
            *(False, False, False),
            *(False, False, False),
            *(False, False, True),
            *(False, True, True),
        ],
        # table: Duck family tree
        1: [False, False, True, False, True, False, True, False],
        # table: Duck-related actions
        2: [False, True, True, True, False, True, True],
        # table: nested table
        3: [False, False, False, False, False, False],
        # table: Famous Ducks with Images
        4: [
            *(False, False, False),
            *(False, False, True),
            *(False, False, True),
            *(False, False, True),
            *(False, False, False),
        ],
    }

    for idx_t, table in enumerate(backend.soup.find_all("table")):
        expected = iter(gt_cells[idx_t])
        num_cells = 0
        # Only direct sections and rows, so nested tables are not counted twice.
        for part in table.find_all(["thead", "tbody"], recursive=False):
            for idx_r, row in enumerate(part.find_all("tr", recursive=False)):
                cells = row.find_all(["td", "th"], recursive=False)
                for idx_c, cell in enumerate(cells):
                    assert next(expected) == backend._is_rich_table_cell(cell), (
                        f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
                        f"with text: {cell.text}"
                    )
                    num_cells += 1
        assert num_cells == len(gt_cells[idx_t]), (
            f"Cell number does not match in table {idx_t}"
        )
|
||||
|
||||
Reference in New Issue
Block a user