fix(html): handle address, details, and summary tags (#1436)

* fix(html): handle 'address' tag

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* fix(html): handle 'details' tag

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-04-23 09:30:59 +02:00
committed by GitHub
parent c2470ed216
commit ed20124544
5 changed files with 82 additions and 6 deletions

View File

@@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
# tags that generate NodeItem elements
TAGS_FOR_NODE_ITEMS: Final = [
"address",
"details",
"h1",
"h2",
"h3",
@@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
"ul",
"ol",
"li",
"summary",
"table",
"figure",
"img",
@@ -163,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(tag, doc)
elif tag.name in ["p"]:
elif tag.name in ["p", "address", "summary"]:
self.handle_paragraph(tag, doc)
elif tag.name in ["pre", "code"]:
self.handle_code(tag, doc)
@@ -177,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.handle_figure(tag, doc)
elif tag.name == "img":
self.handle_image(tag, doc)
elif tag.name == "details":
self.handle_details(tag, doc)
else:
self.walk(tag, doc)
@@ -201,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return ["".join(result) + " "]
def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
    """Convert a ``<details>`` tag into a section group and process its children.

    A group named ``"details"`` with label ``GroupLabel.SECTION`` is added to
    ``doc`` under the parent registered for the current level. The children
    of ``element`` are then walked one level deeper, so that items created
    during the walk can attach to the new group via ``self.parents``.

    Args:
        element: The ``details`` tag being processed.
        doc: The document that receives the new group.
    """
    outer_level = self.level

    # Create the group first (its parent is the entry at the current level),
    # then register it as the parent for the next level.
    details_group = doc.add_group(
        name="details",
        label=GroupLabel.SECTION,
        parent=self.parents[outer_level],
        content_layer=self.content_layer,
    )
    self.parents[outer_level + 1] = details_group

    # Descend: children of <details> are handled one level deeper.
    self.level = outer_level + 1
    self.walk(element, doc)

    # Reset the slot just below the level used for the walk, then step back up.
    # NOTE(review): this clears the entry one level deeper than the details
    # group itself; the group's own entry in self.parents is left in place.
    self.parents[self.level + 1] = None
    self.level -= 1
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", ""))
@@ -258,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
)
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
"""Handles paragraph tags (p) or equivalent ones."""
if element.text is None:
return
text = element.text.strip()