feat(html): Support formatting tags in HTML texts (#2111)

* add parsing for formatting tags in HTML backend

Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>

fix latest tests + wiki_duck result files.

Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>

* convert _collect_parent_format_tags to staticmethod

Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>

---------

Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>
This commit is contained in:
krrome
2025-08-22 10:37:34 +02:00
committed by GitHub
parent e76298c40d
commit 94fcc46aa9
15 changed files with 9420 additions and 4456 deletions

View File

@@ -20,7 +20,7 @@ from docling_core.types.doc import (
TableData,
TextItem,
)
from docling_core.types.doc.document import ContentLayer
from docling_core.types.doc.document import ContentLayer, Formatting, Script
from pydantic import AnyUrl, BaseModel, ValidationError
from typing_extensions import override
@@ -54,6 +54,21 @@ _BLOCK_TAGS: Final = {
"table",
}
# Maps the name of an inline HTML formatting tag to the keyword arguments it
# contributes when a ``Formatting`` object is built (see ``_formatting``).
# Several tags map to the same keyword (e.g. <b> and <strong> both set ``bold``).
# NOTE: the insertion order of this dict matters:
# ``_collect_parent_format_tags`` reports matching tags in this order, and
# ``_formatting`` merges them with ``dict.update``, so for the only key that
# can take different values (``script``) the later entry wins.
_FORMAT_TAG_MAP: Final = {
    "b": {"bold": True},
    "strong": {"bold": True},
    "i": {"italic": True},
    "em": {"italic": True},
    # Currently not mapped (kept here as a reminder):
    # "mark",
    # "small",
    "s": {"strikethrough": True},
    "del": {"strikethrough": True},
    "u": {"underline": True},
    "ins": {"underline": True},
    "sub": {"script": Script.SUB},
    "sup": {"script": Script.SUPER},
}
class _Context(BaseModel):
list_ordered_flag_by_ref: dict[str, bool] = {}
@@ -63,23 +78,34 @@ class _Context(BaseModel):
class AnnotatedText(BaseModel):
    """A piece of extracted text together with its inline annotations."""

    # The text content itself.
    text: str
    # Link target (URL or path) that applies to the text; None when not linked.
    hyperlink: Union[AnyUrl, Path, None] = None
    # Formatting that applies to the text; None when no formatting is set.
    formatting: Union[Formatting, None] = None
class AnnotatedTextList(list):
def to_single_text_element(self) -> AnnotatedText:
    """Collapse all entries of this list into a single ``AnnotatedText``.

    The text of every entry is stripped and the pieces are joined with a
    single space. Only one hyperlink and one formatting can be carried by
    the result: the first non-``None`` value of each is kept, and every
    later entry that carries a *different* value only triggers a warning.

    Returns:
        An ``AnnotatedText`` with the joined text, the first hyperlink and
        the first formatting found (each ``None`` when no entry had one).
    """
    current_h = None
    current_f = None
    pieces = []
    for at in self:
        pieces.append(at.text.strip())

        f = at.formatting
        if f is not None:
            if current_f is None:
                current_f = f
            elif f != current_f:
                _log.warning(
                    f"Clashing formatting: '{f}' and '{current_f}'! Chose '{current_f}'"
                )

        h = at.hyperlink
        if h is not None:
            if current_h is None:
                current_h = h
            elif h != current_h:
                _log.warning(
                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                )

    # Single return that also carries the collected formatting; the outer
    # strip() removes whitespace left over from empty leading/trailing pieces.
    return AnnotatedText(
        text=" ".join(pieces).strip(), hyperlink=current_h, formatting=current_f
    )
def simplify_text_elements(self) -> "AnnotatedTextList":
simplified = AnnotatedTextList()
@@ -87,21 +113,27 @@ class AnnotatedTextList(list):
return self
text = self[0].text
hyperlink = self[0].hyperlink
formatting = self[0].formatting
last_elm = text
for i in range(1, len(self)):
if hyperlink == self[i].hyperlink:
if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
sep = " "
if not self[i].text.strip() or not last_elm.strip():
sep = ""
text += sep + self[i].text
last_elm = self[i].text
else:
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
simplified.append(
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
)
text = self[i].text
last_elm = text
hyperlink = self[i].hyperlink
formatting = self[i].formatting
if text:
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
simplified.append(
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
)
return simplified
def split_by_newline(self):
@@ -144,6 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[i] = None
self.hyperlink = None
self.original_url = original_url
self.format_tags: list[str] = []
try:
raw = (
@@ -254,6 +287,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.TEXT,
text=seg_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
@@ -263,6 +297,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if name == "img":
flush_buffer()
self._emit_image(node, doc)
elif name in _FORMAT_TAG_MAP:
with self.use_format([name]):
self._walk(node, doc)
elif name == "a":
with self.use_hyperlink(node):
self._walk(node, doc)
@@ -292,6 +329,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
flush_buffer()
@staticmethod
def _collect_parent_format_tags(item: PageElement) -> list[str]:
    """Return the formatting tags found among the DOM ancestors of ``item``.

    Args:
        item: The element whose ancestors are inspected; ``item`` itself is
            not considered.

    Returns:
        Every key of ``_FORMAT_TAG_MAP`` that is the name of at least one
        ancestor, each listed once and in the order of ``_FORMAT_TAG_MAP``
        (not in nesting order).
    """
    # Walk the ancestor chain once instead of once per known format tag.
    ancestor_names = {ancestor.name for ancestor in item.parents}
    return [tag for tag in _FORMAT_TAG_MAP if tag in ancestor_names]
@property
def _formatting(self):
    """Build the ``Formatting`` implied by the currently active format tags.

    The keyword arguments of all tags in ``self.format_tags`` are merged in
    list order, so a later tag overrides an earlier one for the same key.

    Returns:
        A ``Formatting`` instance, or ``None`` when no tag contributes
        any keyword argument.
    """
    merged = {
        key: value
        for tag in self.format_tags
        for key, value in _FORMAT_TAG_MAP[tag].items()
    }
    return Formatting(**merged) if merged else None
def _extract_text_and_hyperlink_recursively(
self,
item: PageElement,
@@ -302,15 +360,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
result: AnnotatedTextList = AnnotatedTextList()
# If find_parent_annotation, make sure that we keep track of
# any a-tag that has been present in the DOM-parents already.
# any a- or formatting-tag that has been present in the
# DOM-parents already.
if find_parent_annotation:
format_tags = self._collect_parent_format_tags(item)
this_parent = item.parent
while this_parent is not None:
if this_parent.name == "a" and this_parent.get("href"):
with self.use_hyperlink(this_parent):
return self._extract_text_and_hyperlink_recursively(
item, ignore_list
)
with self.use_format(format_tags):
with self.use_hyperlink(this_parent):
return self._extract_text_and_hyperlink_recursively(
item, ignore_list
)
this_parent = this_parent.parent
if isinstance(item, PreformattedString):
@@ -320,18 +381,37 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = item.strip()
if text:
return AnnotatedTextList(
[AnnotatedText(text=text, hyperlink=self.hyperlink)]
[
AnnotatedText(
text=text,
hyperlink=self.hyperlink,
formatting=self._formatting,
)
]
)
if keep_newlines and item.strip("\n\r") == "":
return AnnotatedTextList(
[AnnotatedText(text="\n", hyperlink=self.hyperlink)]
[
AnnotatedText(
text="\n",
hyperlink=self.hyperlink,
formatting=self._formatting,
)
]
)
return AnnotatedTextList()
tag = cast(Tag, item)
if not ignore_list or (tag.name not in ["ul", "ol"]):
for child in tag:
if isinstance(child, Tag) and child.name == "a":
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
with self.use_format([child.name]):
result.extend(
self._extract_text_and_hyperlink_recursively(
child, ignore_list, keep_newlines=keep_newlines
)
)
elif isinstance(child, Tag) and child.name == "a":
with self.use_hyperlink(child):
result.extend(
self._extract_text_and_hyperlink_recursively(
@@ -369,6 +449,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if this_href:
self.hyperlink = old_hyperlink
@contextmanager
def use_format(self, tags: list[str]):
    """Activate the given format tags for the duration of the ``with`` body.

    Args:
        tags: Tag names to append to ``self.format_tags``. An empty list
            leaves the state untouched.

    Yields:
        ``None``. On exit (also when the body raises) the last
        ``len(tags)`` entries of ``self.format_tags`` are dropped again.
    """
    # Guard clause: nothing to push, hence nothing to pop afterwards.
    if not tags:
        yield None
        return

    pushed = len(tags)
    self.format_tags.extend(tags)
    try:
        yield None
    finally:
        # Rebind to a copy without the entries pushed above.
        self.format_tags = self.format_tags[:-pushed]
@contextmanager
def use_inline_group(
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
@@ -420,6 +511,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = doc.add_title(
text_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
# the other levels need to be lowered by 1 if a title was set
@@ -449,6 +541,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
orig=annotated_text.text,
level=self.level,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
self.level += 1
@@ -529,6 +622,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.TEXT,
text=li_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
@@ -551,6 +645,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
orig=li_text,
parent=list_group,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
@@ -603,6 +698,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.TEXT,
text=seg_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
@@ -637,6 +733,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level],
text=text_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
@@ -696,6 +793,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text=text_clean,
orig=caption_anno_text.text,
content_layer=self.content_layer,
formatting=caption_anno_text.formatting,
hyperlink=caption_anno_text.hyperlink,
)