fix(html): handle 'address' tag

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-04-22 14:41:33 +02:00
parent c2470ed216
commit 78f56c1e38

View File

@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
# tags that generate NodeItem elements # tags that generate NodeItem elements
TAGS_FOR_NODE_ITEMS: Final = [ TAGS_FOR_NODE_ITEMS: Final = [
"address",
"h1", "h1",
"h2", "h2",
"h3", "h3",
@@ -163,7 +164,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None: def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(tag, doc) self.handle_header(tag, doc)
elif tag.name in ["p"]: elif tag.name in ["p", "address"]:
self.handle_paragraph(tag, doc) self.handle_paragraph(tag, doc)
elif tag.name in ["pre", "code"]: elif tag.name in ["pre", "code"]:
self.handle_code(tag, doc) self.handle_code(tag, doc)
@@ -258,7 +259,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
) )
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p).""" """Handles paragraph tags (p) or equivalent ones."""
if element.text is None: if element.text is None:
return return
text = element.text.strip() text = element.text.strip()