mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
fix(html): handle 'address' tag
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
c2470ed216
commit
78f56c1e38
@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
# tags that generate NodeItem elements
|
# tags that generate NodeItem elements
|
||||||
TAGS_FOR_NODE_ITEMS: Final = [
|
TAGS_FOR_NODE_ITEMS: Final = [
|
||||||
|
"address",
|
||||||
"h1",
|
"h1",
|
||||||
"h2",
|
"h2",
|
||||||
"h3",
|
"h3",
|
||||||
@@ -163,7 +164,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||||
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||||
self.handle_header(tag, doc)
|
self.handle_header(tag, doc)
|
||||||
elif tag.name in ["p"]:
|
elif tag.name in ["p", "address"]:
|
||||||
self.handle_paragraph(tag, doc)
|
self.handle_paragraph(tag, doc)
|
||||||
elif tag.name in ["pre", "code"]:
|
elif tag.name in ["pre", "code"]:
|
||||||
self.handle_code(tag, doc)
|
self.handle_code(tag, doc)
|
||||||
@@ -258,7 +259,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
||||||
"""Handles paragraph tags (p)."""
|
"""Handles paragraph tags (p) or equivalent ones."""
|
||||||
if element.text is None:
|
if element.text is None:
|
||||||
return
|
return
|
||||||
text = element.text.strip()
|
text = element.text.strip()
|
||||||
|
Loading…
Reference in New Issue
Block a user