fix(HTML): concatenation of child strings in table cells and list items (#1981)

fix(HTML): ensure correct concatenation of child strings in table cells and list items

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-07-24 11:19:25 +02:00
committed by GitHub
parent 7b5f86098d
commit 5132f061a8
5 changed files with 162 additions and 134 deletions

View File

@@ -5,7 +5,7 @@ from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast
from bs4 import BeautifulSoup, NavigableString, Tag
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
from docling_core.types.doc import (
DocItem,
@@ -297,7 +297,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
):
parts.append(child)
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
text_part = child.get_text()
text_part = HTMLDocumentBackend.get_text(child)
if text_part:
parts.append(text_part)
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
@@ -417,6 +417,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
content_layer=self.content_layer,
)
@staticmethod
def get_text(item: PageElement) -> str:
    """Return the concatenated text found under ``item``.

    Strings are joined without any separator. The one exception is that the
    text gathered from a ``<p>`` or ``<li>`` tag is followed by a single
    space, so the content of adjacent paragraphs or list items does not run
    together. The space is added even when the tag is ``item`` itself, so
    callers that need a clean result should strip it.

    Args:
        item: The element to read. A ``NavigableString`` contributes its own
            text, a ``Tag`` contributes the text of all its descendants, and
            any other kind of element contributes nothing.

    Returns:
        The collected text as a single string (empty if nothing was found).
    """

    def _iter_text(node: PageElement):
        """Yield the text contributed by ``node`` (at most one string)."""
        if isinstance(node, NavigableString):
            yield node
        elif isinstance(node, Tag):
            # Gather everything below this tag first, so the trailing space
            # lands after the complete content of the <p>/<li>.
            inner = "".join(
                piece for child in node for piece in _iter_text(child)
            )
            if node.name in {"p", "li"}:
                inner += " "
            yield inner

    # Joining (rather than returning the piece directly) always produces a
    # plain ``str``, even when ``item`` is itself a NavigableString.
    return "".join(_iter_text(item))
@staticmethod
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
"""Extract colspan and rowspan values from a table cell tag.
@@ -510,9 +540,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
formula.replace_with(NavigableString(math_formula))
# TODO: extract content correctly from table-cells with lists
text = html_cell.text
# label = html_cell.name
text = HTMLDocumentBackend.get_text(html_cell).strip()
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
if row_header:
row_span -= 1