From 78f56c1e380a41651961918149b275d136d988ac Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Tue, 22 Apr 2025 14:41:33 +0200 Subject: [PATCH] fix(html): handle 'address' tag Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index aa2637f2..91d40d3f 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -26,6 +26,7 @@ _log = logging.getLogger(__name__) # tags that generate NodeItem elements TAGS_FOR_NODE_ITEMS: Final = [ + "address", "h1", "h2", "h3", @@ -163,7 +164,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None: if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: self.handle_header(tag, doc) - elif tag.name in ["p"]: + elif tag.name in ["p", "address"]: self.handle_paragraph(tag, doc) elif tag.name in ["pre", "code"]: self.handle_code(tag, doc) @@ -258,7 +259,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: - """Handles paragraph tags (p).""" + """Handles paragraph tags (p) or equivalent ones.""" if element.text is None: return text = element.text.strip()