fix: fix HTML table parser and JATS backend bugs (#1948)

Fix a bug in parsing HTML tables in HTML backend.
Fix a bug in the test file that prevented the JATS backend tests from running.
Ensure that the JATS backend creates headings with the right level.
Remove unnecessary data files for testing JATS backend.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-07-16 10:49:24 +02:00
committed by GitHub
parent d6d2dbe2f9
commit e1e3053695
27 changed files with 29206 additions and 14362 deletions

View File

@@ -379,6 +379,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
else:
_log.debug(f"list-item has no text: {element}")
@staticmethod
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
    """Extract the colspan and rowspan values from a table cell tag.

    The 'colspan' and 'rowspan' attributes are read and validated
    independently of each other. If an attribute does not exist or its value
    is not a string of decimal digits, the corresponding span defaults to 1.

    Args:
        cell: The table cell tag whose span attributes are read.

    Returns:
        A ``(colspan, rowspan)`` tuple of integers.
    """

    def _parse_span(attr_name: str) -> int:
        # The attribute value is normalized to str before validation.
        raw = str(cell.get(attr_name, "1"))
        # str.isdecimal() accepts only characters that int() can parse;
        # str.isnumeric() also accepts e.g. "²" or "½", which make int() raise.
        return int(raw) if raw.isdecimal() else 1

    # Each span is checked against its own raw value, so an invalid colspan
    # cannot discard a valid rowspan (and vice versa).
    return _parse_span("colspan"), _parse_span("rowspan")
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
nested_tables = element.find("table")
@@ -398,10 +417,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if not isinstance(row, Tag):
continue
cell_tag = cast(Tag, cell)
val = cell_tag.get("colspan", "1")
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
col_count += col_span
if cell_tag.name == "td" or row_span == 1:
is_row_header = False
num_cols = max(num_cols, col_count)
if not is_row_header:
@@ -428,10 +446,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
row_header = True
for html_cell in cells:
if isinstance(html_cell, Tag):
_, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
if html_cell.name == "td":
col_header = False
row_header = False
elif html_cell.get("rowspan") is None:
elif row_span == 1:
row_header = False
if not row_header:
row_idx += 1
@@ -456,18 +475,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = html_cell.text
# label = html_cell.name
col_val = html_cell.get("colspan", "1")
col_span = (
int(col_val)
if isinstance(col_val, str) and col_val.isnumeric()
else 1
)
row_val = html_cell.get("rowspan", "1")
row_span = (
int(row_val)
if isinstance(row_val, str) and row_val.isnumeric()
else 1
)
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
if row_header:
row_span -= 1
while (

View File

@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
# Initialize the root of the document hierarchy
self.root: Optional[NodeItem] = None
self.valid = False
self.hlevel: int = 0
self.valid: bool = False
try:
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.seek(0)
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
self.hlevel = 0
# Get metadata XML components
xml_components: XMLComponents = self._parse_metadata()
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
if not text:
continue
parent = doc.add_heading(parent=self.root, text=title)
parent = doc.add_heading(
parent=self.root, text=title, level=self.hlevel + 1
)
doc.add_text(
parent=parent,
text=text,
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
elif child.tag == "ack":
text = DEFAULT_HEADER_ACKNOWLEDGMENTS
if text:
new_parent = doc.add_heading(text=text, parent=parent)
self.hlevel += 1
new_parent = doc.add_heading(
text=text, parent=parent, level=self.hlevel
)
elif child.tag == "list":
new_parent = doc.add_group(
label=GroupLabel.LIST, name="list", parent=parent
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
new_text = self._walk_linear(doc, new_parent, child)
if not (node.getparent().tag == "p" and node.tag in flush_tags):
node_text += new_text
if child.tag in ("sec", "ack") and text:
self.hlevel -= 1
# pick up the tail text
node_text += child.tail.replace("\n", " ") if child.tail else ""