feat(html): Support in-line anchor tags in HTML texts (#1659)

* re-implement links for html backend.

Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>

* fix inline groups in list items. write specific test for find_parent_annotation of _extract_text_and_hyperlink_recursively.

Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>

* implement hack for images.

Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>

---------

Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>
This commit is contained in:
krrome
2025-08-18 09:57:16 +02:00
committed by GitHub
parent 76c1fbd6e8
commit 9687297262
25 changed files with 21059 additions and 4111 deletions

View File

@@ -1,8 +1,11 @@
import logging
import re
from contextlib import contextmanager
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
TextItem,
)
from docling_core.types.doc.document import ContentLayer
from pydantic import BaseModel
from pydantic import AnyUrl, BaseModel, ValidationError
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -56,12 +59,76 @@ class _Context(BaseModel):
list_start_by_ref: dict[str, int] = {}
class AnnotatedText(BaseModel):
text: str
hyperlink: Union[AnyUrl, Path, None] = None
class AnnotatedTextList(list):
def to_single_text_element(self) -> AnnotatedText:
current_h = None
current_text = ""
for at in self:
t = at.text
h = at.hyperlink
current_text += t.strip() + " "
if h is not None and current_h is None:
current_h = h
elif h is not None and current_h is not None and h != current_h:
_log.warning(
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
)
return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
def simplify_text_elements(self) -> "AnnotatedTextList":
simplified = AnnotatedTextList()
if not self:
return self
text = self[0].text
hyperlink = self[0].hyperlink
last_elm = text
for i in range(1, len(self)):
if hyperlink == self[i].hyperlink:
sep = " "
if not self[i].text.strip() or not last_elm.strip():
sep = ""
text += sep + self[i].text
last_elm = self[i].text
else:
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
text = self[i].text
last_elm = text
hyperlink = self[i].hyperlink
if text:
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
return simplified
def split_by_newline(self):
super_list = []
active_annotated_text_list = AnnotatedTextList()
for el in self:
sub_texts = el.text.split("\n")
if len(sub_texts) == 1:
active_annotated_text_list.append(el)
else:
for text in sub_texts:
sub_el = deepcopy(el)
sub_el.text = text
active_annotated_text_list.append(sub_el)
super_list.append(active_annotated_text_list)
active_annotated_text_list = AnnotatedTextList()
if active_annotated_text_list:
super_list.append(active_annotated_text_list)
return super_list
class HTMLDocumentBackend(DeclarativeDocumentBackend):
@override
def __init__(
self,
in_doc: InputDocument,
path_or_stream: Union[BytesIO, Path],
original_url: Optional[AnyUrl] = None,
):
super().__init__(in_doc, path_or_stream)
self.soup: Optional[Tag] = None
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.ctx = _Context()
for i in range(self.max_levels):
self.parents[i] = None
self.hyperlink = None
self.original_url = original_url
try:
raw = (
@@ -160,26 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
element: The XML tag to parse.
doc: The Docling document to be updated with the parsed content.
"""
buffer: list[str] = []
buffer: AnnotatedTextList = AnnotatedTextList()
def flush_buffer():
if not buffer:
return
text = "".join(buffer).strip()
annotated_text_list = buffer.simplify_text_elements()
parts = annotated_text_list.split_by_newline()
buffer.clear()
if not text:
if not "".join([el.text for el in annotated_text_list]):
return
for part in text.split("\n"):
seg = part.strip()
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
if seg:
doc.add_text(
label=DocItemLabel.TEXT,
text=seg_clean,
orig=seg,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
for annotated_text_list in parts:
with self.use_inline_group(annotated_text_list, doc):
for annotated_text in annotated_text_list:
if annotated_text.text.strip():
seg_clean = HTMLDocumentBackend._clean_unicode(
annotated_text.text.strip()
)
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=seg_clean,
content_layer=self.content_layer,
hyperlink=annotated_text.hyperlink,
)
for node in element.contents:
if isinstance(node, Tag):
@@ -187,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if name == "img":
flush_buffer()
self._emit_image(node, doc)
elif name == "a":
with self.use_hyperlink(node):
self._walk(node, doc)
elif name in _BLOCK_TAGS:
flush_buffer()
self._handle_block(node, doc)
@@ -194,28 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
flush_buffer()
self._walk(node, doc)
else:
buffer.append(node.text)
buffer.extend(
self._extract_text_and_hyperlink_recursively(
node, find_parent_annotation=True, keep_newlines=True
)
)
elif isinstance(node, NavigableString) and not isinstance(
node, PreformattedString
):
buffer.append(str(node))
if str(node).strip("\n\r") == "":
flush_buffer()
else:
buffer.extend(
self._extract_text_and_hyperlink_recursively(
node, find_parent_annotation=True, keep_newlines=True
)
)
flush_buffer()
def _extract_text_and_hyperlink_recursively(
self,
item: PageElement,
ignore_list=False,
find_parent_annotation=False,
keep_newlines=False,
) -> AnnotatedTextList:
result: AnnotatedTextList = AnnotatedTextList()
# If find_parent_annotation, make sure that we keep track of
# any a-tag that has been present in the DOM-parents already.
if find_parent_annotation:
this_parent = item.parent
while this_parent is not None:
if this_parent.name == "a" and this_parent.get("href"):
with self.use_hyperlink(this_parent):
return self._extract_text_and_hyperlink_recursively(
item, ignore_list
)
this_parent = this_parent.parent
if isinstance(item, PreformattedString):
return AnnotatedTextList()
if isinstance(item, NavigableString):
text = item.strip()
if text:
return AnnotatedTextList(
[AnnotatedText(text=text, hyperlink=self.hyperlink)]
)
if keep_newlines and item.strip("\n\r") == "":
return AnnotatedTextList(
[AnnotatedText(text="\n", hyperlink=self.hyperlink)]
)
return AnnotatedTextList()
tag = cast(Tag, item)
if not ignore_list or (tag.name not in ["ul", "ol"]):
for child in tag:
if isinstance(child, Tag) and child.name == "a":
with self.use_hyperlink(child):
result.extend(
self._extract_text_and_hyperlink_recursively(
child, ignore_list, keep_newlines=keep_newlines
)
)
else:
# Recursively get the child's text content
result.extend(
self._extract_text_and_hyperlink_recursively(
child, ignore_list, keep_newlines=keep_newlines
)
)
return result
@contextmanager
def use_hyperlink(self, tag):
this_href = tag.get("href")
if this_href is None:
yield None
else:
if this_href:
old_hyperlink = self.hyperlink
if self.original_url is not None:
this_href = urljoin(self.original_url, this_href)
# ugly fix for relative links since pydantic does not support them.
try:
AnyUrl(this_href)
except ValidationError:
this_href = Path(this_href)
self.hyperlink = this_href
try:
yield None
finally:
if this_href:
self.hyperlink = old_hyperlink
@contextmanager
def use_inline_group(
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
):
"""Create an inline group for annotated texts.
Checks if annotated_text_list has more than one item and if so creates an inline
group in which the text elements can then be generated. While the context manager
is active the inline group is set as the current parent.
Args:
annotated_text_list (AnnotatedTextList): Annotated text
doc (DoclingDocument): Currently used document
Yields:
None: _description_
"""
if len(annotated_text_list) > 1:
inline_fmt = doc.add_group(
label=GroupLabel.INLINE,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
self.parents[self.level + 1] = inline_fmt
self.level += 1
try:
yield None
finally:
self.parents[self.level] = None
self.level -= 1
else:
yield None
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
tag_name = tag.name.lower()
# set default content layer to BODY as soon as we encounter a heading
self.content_layer = ContentLayer.BODY
level = int(tag_name[1])
text = tag.get_text(strip=True, separator=" ")
text_clean = HTMLDocumentBackend._clean_unicode(text)
annotated_text_list = self._extract_text_and_hyperlink_recursively(
tag, find_parent_annotation=True
)
annotated_text = annotated_text_list.to_single_text_element()
text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
# the first level is for the title item
if level == 1:
for key in self.parents.keys():
self.parents[key] = None
self.level = 0
self.parents[self.level + 1] = doc.add_title(
text=text_clean, orig=text, content_layer=self.content_layer
text_clean,
content_layer=self.content_layer,
hyperlink=annotated_text.hyperlink,
)
# the other levels need to be lowered by 1 if a title was set
else:
@@ -241,9 +445,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = doc.add_heading(
parent=self.parents[self.level],
text=text_clean,
orig=text,
orig=annotated_text.text,
level=self.level,
content_layer=self.content_layer,
hyperlink=annotated_text.hyperlink,
)
self.level += 1
for img_tag in tag("img"):
@@ -292,37 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
# 2) extract only the "direct" text from this <li>
parts: list[str] = []
for child in li.contents:
if isinstance(child, NavigableString) and not isinstance(
child, PreformattedString
):
parts.append(child)
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
text_part = HTMLDocumentBackend.get_text(child)
if text_part:
parts.append(text_part)
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
parts = self._extract_text_and_hyperlink_recursively(
li, ignore_list=True, find_parent_annotation=True
)
min_parts = parts.simplify_text_elements()
li_text = re.sub(
r"\s+|\n+", " ", "".join([el.text for el in min_parts])
).strip()
# 3) add the list item
if li_text:
self.parents[self.level + 1] = doc.add_list_item(
text=li_clean,
enumerated=is_ordered,
marker=marker,
orig=li_text,
parent=list_group,
content_layer=self.content_layer,
)
if len(min_parts) > 1:
# create an empty list element in order to hook the inline group onto that one
self.parents[self.level + 1] = doc.add_list_item(
text="",
enumerated=is_ordered,
marker=marker,
parent=list_group,
content_layer=self.content_layer,
)
self.level += 1
with self.use_inline_group(min_parts, doc):
for annotated_text in min_parts:
li_text = re.sub(
r"\s+|\n+", " ", annotated_text.text
).strip()
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=li_clean,
content_layer=self.content_layer,
hyperlink=annotated_text.hyperlink,
)
# 4) recurse into any nested lists, attaching them to this <li> item
for sublist in li({"ul", "ol"}, recursive=False):
if isinstance(sublist, Tag):
self.level += 1
self._handle_block(sublist, doc)
self.parents[self.level + 1] = None
self.level -= 1
# 4) recurse into any nested lists, attaching them to this <li> item
for sublist in li({"ul", "ol"}, recursive=False):
if isinstance(sublist, Tag):
self._handle_block(sublist, doc)
# now the list element with inline group is not a parent anymore
self.parents[self.level] = None
self.level -= 1
else:
annotated_text = min_parts[0]
li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
self.parents[self.level + 1] = doc.add_list_item(
text=li_clean,
enumerated=is_ordered,
marker=marker,
orig=li_text,
parent=list_group,
content_layer=self.content_layer,
hyperlink=annotated_text.hyperlink,
)
# 4) recurse into any nested lists, attaching them to this <li> item
for sublist in li({"ul", "ol"}, recursive=False):
if isinstance(sublist, Tag):
self.level += 1
self._handle_block(sublist, doc)
self.parents[self.level + 1] = None
self.level -= 1
else:
for sublist in li({"ul", "ol"}, recursive=False):
if isinstance(sublist, Tag):
@@ -351,17 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self._handle_list(tag, doc)
elif tag_name in {"p", "address", "summary"}:
for part in tag.text.split("\n"):
seg = part.strip()
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
if seg:
doc.add_text(
label=DocItemLabel.TEXT,
text=seg_clean,
orig=seg,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
text_list = self._extract_text_and_hyperlink_recursively(
tag, find_parent_annotation=True
)
annotated_texts = text_list.simplify_text_elements()
for part in annotated_texts.split_by_newline():
with self.use_inline_group(part, doc):
for annotated_text in part:
if seg := annotated_text.text.strip():
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=seg_clean,
content_layer=self.content_layer,
hyperlink=annotated_text.hyperlink,
)
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
self._emit_image(img_tag, doc)
@@ -380,15 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif tag_name in {"pre", "code"}:
# handle monospace code snippets (pre).
text = tag.get_text(strip=True)
text_clean = HTMLDocumentBackend._clean_unicode(text)
if text:
doc.add_code(
parent=self.parents[self.level],
text=text_clean,
orig=text,
content_layer=self.content_layer,
)
text_list = self._extract_text_and_hyperlink_recursively(
tag, find_parent_annotation=True
)
annotated_texts = text_list.simplify_text_elements()
with self.use_inline_group(annotated_texts, doc):
for annotated_text in annotated_texts:
text_clean = HTMLDocumentBackend._clean_unicode(
annotated_text.text.strip()
)
doc.add_code(
parent=self.parents[self.level],
text=text_clean,
content_layer=self.content_layer,
hyperlink=annotated_text.hyperlink,
)
elif tag_name == "details":
# handle details and its content.
@@ -405,22 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
figure = img_tag.find_parent("figure")
caption: str = ""
caption: AnnotatedTextList = AnnotatedTextList()
# check if the figure has a link - this is HACK:
def get_img_hyperlink(img_tag):
this_parent = img_tag.parent
while this_parent is not None:
if this_parent.name == "a" and this_parent.get("href"):
return this_parent.get("href")
this_parent = this_parent.parent
return None
if img_hyperlink := get_img_hyperlink(img_tag):
caption.append(
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
)
if isinstance(figure, Tag):
caption_tag = figure.find("figcaption", recursive=False)
if isinstance(caption_tag, Tag):
caption = caption_tag.get_text()
if not caption:
caption = str(img_tag.get("alt", "")).strip()
caption = self._extract_text_and_hyperlink_recursively(
caption_tag, find_parent_annotation=True
)
if not caption and img_tag.get("alt"):
caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
caption_anno_text = caption.to_single_text_element()
caption_item: Optional[TextItem] = None
if caption:
caption_clean = HTMLDocumentBackend._clean_unicode(caption)
if caption_anno_text.text:
text_clean = HTMLDocumentBackend._clean_unicode(
caption_anno_text.text.strip()
)
print(caption_anno_text)
caption_item = doc.add_text(
label=DocItemLabel.CAPTION,
text=caption_clean,
orig=caption,
text=text_clean,
orig=caption_anno_text.text,
content_layer=self.content_layer,
hyperlink=caption_anno_text.hyperlink,
)
doc.add_picture(