feat(html): Support in-line anchor tags in HTML texts (#1659)

* re-implement links for html backend. Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch> * fix inline groups in list items. write specific test for find_parent_annotation of _extract_text_and_hyperlink_recursively. Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch> * implement hack for images. Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch> --------- Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>
2025-12-10 13:48:13 +00:00 · 2025-08-18 09:57:16 +02:00
parent 76c1fbd6e8
commit 9687297262
25 changed files with 21059 additions and 4111 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,8 +1,11 @@
 import logging
 import re
+from contextlib import contextmanager
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
+from urllib.parse import urljoin

 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 from bs4.element import PreformattedString
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
    TextItem,
 )
 from docling_core.types.doc.document import ContentLayer
-from pydantic import BaseModel
+from pydantic import AnyUrl, BaseModel, ValidationError
 from typing_extensions import override

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -56,12 +59,76 @@ class _Context(BaseModel):
    list_start_by_ref: dict[str, int] = {}


+class AnnotatedText(BaseModel):
+    text: str
+    hyperlink: Union[AnyUrl, Path, None] = None
+
+
+class AnnotatedTextList(list):
+    def to_single_text_element(self) -> AnnotatedText:
+        current_h = None
+        current_text = ""
+        for at in self:
+            t = at.text
+            h = at.hyperlink
+            current_text += t.strip() + " "
+            if h is not None and current_h is None:
+                current_h = h
+            elif h is not None and current_h is not None and h != current_h:
+                _log.warning(
+                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
+                )
+        return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
+
+    def simplify_text_elements(self) -> "AnnotatedTextList":
+        simplified = AnnotatedTextList()
+        if not self:
+            return self
+        text = self[0].text
+        hyperlink = self[0].hyperlink
+        last_elm = text
+        for i in range(1, len(self)):
+            if hyperlink == self[i].hyperlink:
+                sep = " "
+                if not self[i].text.strip() or not last_elm.strip():
+                    sep = ""
+                text += sep + self[i].text
+                last_elm = self[i].text
+            else:
+                simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+                text = self[i].text
+                last_elm = text
+                hyperlink = self[i].hyperlink
+        if text:
+            simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+        return simplified
+
+    def split_by_newline(self):
+        super_list = []
+        active_annotated_text_list = AnnotatedTextList()
+        for el in self:
+            sub_texts = el.text.split("\n")
+            if len(sub_texts) == 1:
+                active_annotated_text_list.append(el)
+            else:
+                for text in sub_texts:
+                    sub_el = deepcopy(el)
+                    sub_el.text = text
+                    active_annotated_text_list.append(sub_el)
+                    super_list.append(active_annotated_text_list)
+                    active_annotated_text_list = AnnotatedTextList()
+        if active_annotated_text_list:
+            super_list.append(active_annotated_text_list)
+        return super_list
+
+
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
    @override
    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
+        original_url: Optional[AnyUrl] = None,
    ):
        super().__init__(in_doc, path_or_stream)
        self.soup: Optional[Tag] = None
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        self.ctx = _Context()
        for i in range(self.max_levels):
            self.parents[i] = None
+        self.hyperlink = None
+        self.original_url = original_url

        try:
            raw = (
@@ -160,26 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            element: The XML tag to parse.
            doc: The Docling document to be updated with the parsed content.
        """
-        buffer: list[str] = []
+        buffer: AnnotatedTextList = AnnotatedTextList()

        def flush_buffer():
            if not buffer:
                return
-            text = "".join(buffer).strip()
+            annotated_text_list = buffer.simplify_text_elements()
+            parts = annotated_text_list.split_by_newline()
            buffer.clear()
-            if not text:
+
+            if not "".join([el.text for el in annotated_text_list]):
                return
-            for part in text.split("\n"):
-                seg = part.strip()
-                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                if seg:
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        text=seg_clean,
-                        orig=seg,
-                        parent=self.parents[self.level],
-                        content_layer=self.content_layer,
-                    )
+
+            for annotated_text_list in parts:
+                with self.use_inline_group(annotated_text_list, doc):
+                    for annotated_text in annotated_text_list:
+                        if annotated_text.text.strip():
+                            seg_clean = HTMLDocumentBackend._clean_unicode(
+                                annotated_text.text.strip()
+                            )
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=seg_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )

        for node in element.contents:
            if isinstance(node, Tag):
@@ -187,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                if name == "img":
                    flush_buffer()
                    self._emit_image(node, doc)
+                elif name == "a":
+                    with self.use_hyperlink(node):
+                        self._walk(node, doc)
                elif name in _BLOCK_TAGS:
                    flush_buffer()
                    self._handle_block(node, doc)
@@ -194,28 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    flush_buffer()
                    self._walk(node, doc)
                else:
-                    buffer.append(node.text)
+                    buffer.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            node, find_parent_annotation=True, keep_newlines=True
+                        )
+                    )
            elif isinstance(node, NavigableString) and not isinstance(
                node, PreformattedString
            ):
-                buffer.append(str(node))
+                if str(node).strip("\n\r") == "":
+                    flush_buffer()
+                else:
+                    buffer.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            node, find_parent_annotation=True, keep_newlines=True
+                        )
+                    )

        flush_buffer()

+    def _extract_text_and_hyperlink_recursively(
+        self,
+        item: PageElement,
+        ignore_list=False,
+        find_parent_annotation=False,
+        keep_newlines=False,
+    ) -> AnnotatedTextList:
+        result: AnnotatedTextList = AnnotatedTextList()
+
+        # If find_parent_annotation, make sure that we keep track of
+        # any a-tag that has been present in the DOM-parents already.
+        if find_parent_annotation:
+            this_parent = item.parent
+            while this_parent is not None:
+                if this_parent.name == "a" and this_parent.get("href"):
+                    with self.use_hyperlink(this_parent):
+                        return self._extract_text_and_hyperlink_recursively(
+                            item, ignore_list
+                        )
+                this_parent = this_parent.parent
+
+        if isinstance(item, PreformattedString):
+            return AnnotatedTextList()
+
+        if isinstance(item, NavigableString):
+            text = item.strip()
+            if text:
+                return AnnotatedTextList(
+                    [AnnotatedText(text=text, hyperlink=self.hyperlink)]
+                )
+            if keep_newlines and item.strip("\n\r") == "":
+                return AnnotatedTextList(
+                    [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
+                )
+            return AnnotatedTextList()
+
+        tag = cast(Tag, item)
+        if not ignore_list or (tag.name not in ["ul", "ol"]):
+            for child in tag:
+                if isinstance(child, Tag) and child.name == "a":
+                    with self.use_hyperlink(child):
+                        result.extend(
+                            self._extract_text_and_hyperlink_recursively(
+                                child, ignore_list, keep_newlines=keep_newlines
+                            )
+                        )
+                else:
+                    # Recursively get the child's text content
+                    result.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            child, ignore_list, keep_newlines=keep_newlines
+                        )
+                    )
+        return result
+
+    @contextmanager
+    def use_hyperlink(self, tag):
+        this_href = tag.get("href")
+        if this_href is None:
+            yield None
+        else:
+            if this_href:
+                old_hyperlink = self.hyperlink
+                if self.original_url is not None:
+                    this_href = urljoin(self.original_url, this_href)
+                # ugly fix for relative links since pydantic does not support them.
+                try:
+                    AnyUrl(this_href)
+                except ValidationError:
+                    this_href = Path(this_href)
+                self.hyperlink = this_href
+            try:
+                yield None
+            finally:
+                if this_href:
+                    self.hyperlink = old_hyperlink
+
+    @contextmanager
+    def use_inline_group(
+        self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
+    ):
+        """Create an inline group for annotated texts.
+
+        Checks if annotated_text_list has more than one item and if so creates an inline
+        group in which the text elements can then be generated. While the context manager
+        is active the inline group is set as the current parent.
+
+        Args:
+            annotated_text_list (AnnotatedTextList): Annotated text
+            doc (DoclingDocument): Currently used document
+
+        Yields:
+            None: _description_
+        """
+        if len(annotated_text_list) > 1:
+            inline_fmt = doc.add_group(
+                label=GroupLabel.INLINE,
+                parent=self.parents[self.level],
+                content_layer=self.content_layer,
+            )
+            self.parents[self.level + 1] = inline_fmt
+            self.level += 1
+            try:
+                yield None
+            finally:
+                self.parents[self.level] = None
+                self.level -= 1
+        else:
+            yield None
+
    def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
        tag_name = tag.name.lower()
        # set default content layer to BODY as soon as we encounter a heading
        self.content_layer = ContentLayer.BODY
        level = int(tag_name[1])
-        text = tag.get_text(strip=True, separator=" ")
-        text_clean = HTMLDocumentBackend._clean_unicode(text)
+        annotated_text_list = self._extract_text_and_hyperlink_recursively(
+            tag, find_parent_annotation=True
+        )
+        annotated_text = annotated_text_list.to_single_text_element()
+        text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
        # the first level is for the title item
        if level == 1:
            for key in self.parents.keys():
                self.parents[key] = None
            self.level = 0
            self.parents[self.level + 1] = doc.add_title(
-                text=text_clean, orig=text, content_layer=self.content_layer
+                text_clean,
+                content_layer=self.content_layer,
+                hyperlink=annotated_text.hyperlink,
            )
        # the other levels need to be lowered by 1 if a title was set
        else:
@@ -241,9 +445,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            self.parents[self.level + 1] = doc.add_heading(
                parent=self.parents[self.level],
                text=text_clean,
-                orig=text,
+                orig=annotated_text.text,
                level=self.level,
                content_layer=self.content_layer,
+                hyperlink=annotated_text.hyperlink,
            )
        self.level += 1
        for img_tag in tag("img"):
@@ -292,37 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    marker = ""

                # 2) extract only the "direct" text from this <li>
-                parts: list[str] = []
-                for child in li.contents:
-                    if isinstance(child, NavigableString) and not isinstance(
-                        child, PreformattedString
-                    ):
-                        parts.append(child)
-                    elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
-                        text_part = HTMLDocumentBackend.get_text(child)
-                        if text_part:
-                            parts.append(text_part)
-                li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
-                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                parts = self._extract_text_and_hyperlink_recursively(
+                    li, ignore_list=True, find_parent_annotation=True
+                )
+                min_parts = parts.simplify_text_elements()
+                li_text = re.sub(
+                    r"\s+|\n+", " ", "".join([el.text for el in min_parts])
+                ).strip()

                # 3) add the list item
                if li_text:
-                    self.parents[self.level + 1] = doc.add_list_item(
-                        text=li_clean,
-                        enumerated=is_ordered,
-                        marker=marker,
-                        orig=li_text,
-                        parent=list_group,
-                        content_layer=self.content_layer,
-                    )
+                    if len(min_parts) > 1:
+                        # create an empty list element in order to hook the inline group onto that one
+                        self.parents[self.level + 1] = doc.add_list_item(
+                            text="",
+                            enumerated=is_ordered,
+                            marker=marker,
+                            parent=list_group,
+                            content_layer=self.content_layer,
+                        )
+                        self.level += 1
+                        with self.use_inline_group(min_parts, doc):
+                            for annotated_text in min_parts:
+                                li_text = re.sub(
+                                    r"\s+|\n+", " ", annotated_text.text
+                                ).strip()
+                                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=li_clean,
+                                    content_layer=self.content_layer,
+                                    hyperlink=annotated_text.hyperlink,
+                                )

-                    # 4) recurse into any nested lists, attaching them to this <li> item
-                    for sublist in li({"ul", "ol"}, recursive=False):
-                        if isinstance(sublist, Tag):
-                            self.level += 1
-                            self._handle_block(sublist, doc)
-                            self.parents[self.level + 1] = None
-                            self.level -= 1
+                        # 4) recurse into any nested lists, attaching them to this <li> item
+                        for sublist in li({"ul", "ol"}, recursive=False):
+                            if isinstance(sublist, Tag):
+                                self._handle_block(sublist, doc)
+
+                        # now the list element with inline group is not a parent anymore
+                        self.parents[self.level] = None
+                        self.level -= 1
+                    else:
+                        annotated_text = min_parts[0]
+                        li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
+                        li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                        self.parents[self.level + 1] = doc.add_list_item(
+                            text=li_clean,
+                            enumerated=is_ordered,
+                            marker=marker,
+                            orig=li_text,
+                            parent=list_group,
+                            content_layer=self.content_layer,
+                            hyperlink=annotated_text.hyperlink,
+                        )
+
+                        # 4) recurse into any nested lists, attaching them to this <li> item
+                        for sublist in li({"ul", "ol"}, recursive=False):
+                            if isinstance(sublist, Tag):
+                                self.level += 1
+                                self._handle_block(sublist, doc)
+                                self.parents[self.level + 1] = None
+                                self.level -= 1
                else:
                    for sublist in li({"ul", "ol"}, recursive=False):
                        if isinstance(sublist, Tag):
@@ -351,17 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            self._handle_list(tag, doc)

        elif tag_name in {"p", "address", "summary"}:
-            for part in tag.text.split("\n"):
-                seg = part.strip()
-                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                if seg:
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        text=seg_clean,
-                        orig=seg,
-                        parent=self.parents[self.level],
-                        content_layer=self.content_layer,
-                    )
+            text_list = self._extract_text_and_hyperlink_recursively(
+                tag, find_parent_annotation=True
+            )
+            annotated_texts = text_list.simplify_text_elements()
+            for part in annotated_texts.split_by_newline():
+                with self.use_inline_group(part, doc):
+                    for annotated_text in part:
+                        if seg := annotated_text.text.strip():
+                            seg_clean = HTMLDocumentBackend._clean_unicode(seg)
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=seg_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )
+
            for img_tag in tag("img"):
                if isinstance(img_tag, Tag):
                    self._emit_image(img_tag, doc)
@@ -380,15 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

        elif tag_name in {"pre", "code"}:
            # handle monospace code snippets (pre).
-            text = tag.get_text(strip=True)
-            text_clean = HTMLDocumentBackend._clean_unicode(text)
-            if text:
-                doc.add_code(
-                    parent=self.parents[self.level],
-                    text=text_clean,
-                    orig=text,
-                    content_layer=self.content_layer,
-                )
+            text_list = self._extract_text_and_hyperlink_recursively(
+                tag, find_parent_annotation=True
+            )
+            annotated_texts = text_list.simplify_text_elements()
+            with self.use_inline_group(annotated_texts, doc):
+                for annotated_text in annotated_texts:
+                    text_clean = HTMLDocumentBackend._clean_unicode(
+                        annotated_text.text.strip()
+                    )
+                    doc.add_code(
+                        parent=self.parents[self.level],
+                        text=text_clean,
+                        content_layer=self.content_layer,
+                        hyperlink=annotated_text.hyperlink,
+                    )

        elif tag_name == "details":
            # handle details and its content.
@@ -405,22 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
        figure = img_tag.find_parent("figure")
-        caption: str = ""
+        caption: AnnotatedTextList = AnnotatedTextList()
+
+        # check if the figure has a link - this is HACK:
+        def get_img_hyperlink(img_tag):
+            this_parent = img_tag.parent
+            while this_parent is not None:
+                if this_parent.name == "a" and this_parent.get("href"):
+                    return this_parent.get("href")
+                this_parent = this_parent.parent
+            return None
+
+        if img_hyperlink := get_img_hyperlink(img_tag):
+            caption.append(
+                AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
+            )
+
        if isinstance(figure, Tag):
            caption_tag = figure.find("figcaption", recursive=False)
            if isinstance(caption_tag, Tag):
-                caption = caption_tag.get_text()
-        if not caption:
-            caption = str(img_tag.get("alt", "")).strip()
+                caption = self._extract_text_and_hyperlink_recursively(
+                    caption_tag, find_parent_annotation=True
+                )
+        if not caption and img_tag.get("alt"):
+            caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
+
+        caption_anno_text = caption.to_single_text_element()

        caption_item: Optional[TextItem] = None
-        if caption:
-            caption_clean = HTMLDocumentBackend._clean_unicode(caption)
+        if caption_anno_text.text:
+            text_clean = HTMLDocumentBackend._clean_unicode(
+                caption_anno_text.text.strip()
+            )
+            print(caption_anno_text)
            caption_item = doc.add_text(
                label=DocItemLabel.CAPTION,
-                text=caption_clean,
-                orig=caption,
+                text=text_clean,
+                orig=caption_anno_text.text,
                content_layer=self.content_layer,
+                hyperlink=caption_anno_text.hyperlink,
            )

        doc.add_picture(