mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
fix: fix HTML table parser and JATS backend bugs (#1948)
Fix a bug in parsing HTML tables in the HTML backend. Fix a bug in the test file that prevented the JATS backend tests from running. Ensure that the JATS backend creates headings with the right level. Remove unnecessary data files for testing the JATS backend. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
d6d2dbe2f9
commit
e1e3053695
@@ -379,6 +379,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
_log.debug(f"list-item has no text: {element}")
|
||||
|
||||
@staticmethod
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
    """Extract colspan and rowspan values from a table cell tag.

    This function retrieves the 'colspan' and 'rowspan' attributes from a given
    table cell tag. Each attribute is evaluated independently of the other.
    If an attribute does not exist or is not a plain decimal integer, it
    defaults to 1.

    Args:
        cell: The table cell whose span attributes are read through ``.get``.

    Returns:
        A ``(colspan, rowspan)`` tuple of integers.
    """

    def _parse_span(attribute: str) -> int:
        # The attribute value is stringified first so that any non-string
        # value returned by ``.get`` is still handled uniformly.
        raw = str(cell.get(attribute, "1"))
        # ``isdecimal`` (rather than ``isnumeric``) only accepts characters
        # that ``int`` can convert, so values such as "²" or "½" fall back
        # to 1 instead of raising ValueError.
        return int(raw) if raw.isdecimal() else 1

    # Each span is validated against its own raw value; checking the colspan
    # string to decide how to parse the rowspan would crash on a non-numeric
    # rowspan or drop a valid one.
    return _parse_span("colspan"), _parse_span("rowspan")
|
||||
|
||||
@staticmethod
|
||||
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
||||
nested_tables = element.find("table")
|
||||
@@ -398,10 +417,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if not isinstance(row, Tag):
|
||||
continue
|
||||
cell_tag = cast(Tag, cell)
|
||||
val = cell_tag.get("colspan", "1")
|
||||
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
||||
col_count += colspan
|
||||
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
|
||||
col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
|
||||
col_count += col_span
|
||||
if cell_tag.name == "td" or row_span == 1:
|
||||
is_row_header = False
|
||||
num_cols = max(num_cols, col_count)
|
||||
if not is_row_header:
|
||||
@@ -428,10 +446,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
row_header = True
|
||||
for html_cell in cells:
|
||||
if isinstance(html_cell, Tag):
|
||||
_, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
||||
if html_cell.name == "td":
|
||||
col_header = False
|
||||
row_header = False
|
||||
elif html_cell.get("rowspan") is None:
|
||||
elif row_span == 1:
|
||||
row_header = False
|
||||
if not row_header:
|
||||
row_idx += 1
|
||||
@@ -456,18 +475,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text = html_cell.text
|
||||
|
||||
# label = html_cell.name
|
||||
col_val = html_cell.get("colspan", "1")
|
||||
col_span = (
|
||||
int(col_val)
|
||||
if isinstance(col_val, str) and col_val.isnumeric()
|
||||
else 1
|
||||
)
|
||||
row_val = html_cell.get("rowspan", "1")
|
||||
row_span = (
|
||||
int(row_val)
|
||||
if isinstance(row_val, str) and row_val.isnumeric()
|
||||
else 1
|
||||
)
|
||||
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
||||
if row_header:
|
||||
row_span -= 1
|
||||
while (
|
||||
|
||||
@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Initialize the root of the document hierarchy
|
||||
self.root: Optional[NodeItem] = None
|
||||
|
||||
self.valid = False
|
||||
self.hlevel: int = 0
|
||||
self.valid: bool = False
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.path_or_stream.seek(0)
|
||||
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
self.hlevel = 0
|
||||
|
||||
# Get metadata XML components
|
||||
xml_components: XMLComponents = self._parse_metadata()
|
||||
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
|
||||
if not text:
|
||||
continue
|
||||
parent = doc.add_heading(parent=self.root, text=title)
|
||||
parent = doc.add_heading(
|
||||
parent=self.root, text=title, level=self.hlevel + 1
|
||||
)
|
||||
doc.add_text(
|
||||
parent=parent,
|
||||
text=text,
|
||||
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif child.tag == "ack":
|
||||
text = DEFAULT_HEADER_ACKNOWLEDGMENTS
|
||||
if text:
|
||||
new_parent = doc.add_heading(text=text, parent=parent)
|
||||
self.hlevel += 1
|
||||
new_parent = doc.add_heading(
|
||||
text=text, parent=parent, level=self.hlevel
|
||||
)
|
||||
elif child.tag == "list":
|
||||
new_parent = doc.add_group(
|
||||
label=GroupLabel.LIST, name="list", parent=parent
|
||||
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
new_text = self._walk_linear(doc, new_parent, child)
|
||||
if not (node.getparent().tag == "p" and node.tag in flush_tags):
|
||||
node_text += new_text
|
||||
if child.tag in ("sec", "ack") and text:
|
||||
self.hlevel -= 1
|
||||
|
||||
# pick up the tail text
|
||||
node_text += child.tail.replace("\n", " ") if child.tail else ""
|
||||
|
||||
Reference in New Issue
Block a user