mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
feat(html): Support in-line anchor tags in HTML texts (#1659)
* re-implement links for html backend. Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch> * fix inline groups in list items. write specific test for find_parent_annotation of _extract_text_and_hyperlink_recursively. Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch> * implement hack for images. Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch> --------- Signed-off-by: Roman Kayan BAZG <roman.kayan@bazg.admin.ch>
This commit is contained in:
@@ -1,8 +1,11 @@
|
||||
import logging
|
||||
import re
|
||||
from contextlib import contextmanager
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Optional, Union, cast
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
||||
from bs4.element import PreformattedString
|
||||
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
from pydantic import BaseModel
|
||||
from pydantic import AnyUrl, BaseModel, ValidationError
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
@@ -56,12 +59,76 @@ class _Context(BaseModel):
|
||||
list_start_by_ref: dict[str, int] = {}
|
||||
|
||||
|
||||
class AnnotatedText(BaseModel):
|
||||
text: str
|
||||
hyperlink: Union[AnyUrl, Path, None] = None
|
||||
|
||||
|
||||
class AnnotatedTextList(list):
|
||||
def to_single_text_element(self) -> AnnotatedText:
|
||||
current_h = None
|
||||
current_text = ""
|
||||
for at in self:
|
||||
t = at.text
|
||||
h = at.hyperlink
|
||||
current_text += t.strip() + " "
|
||||
if h is not None and current_h is None:
|
||||
current_h = h
|
||||
elif h is not None and current_h is not None and h != current_h:
|
||||
_log.warning(
|
||||
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
||||
)
|
||||
return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
|
||||
|
||||
def simplify_text_elements(self) -> "AnnotatedTextList":
|
||||
simplified = AnnotatedTextList()
|
||||
if not self:
|
||||
return self
|
||||
text = self[0].text
|
||||
hyperlink = self[0].hyperlink
|
||||
last_elm = text
|
||||
for i in range(1, len(self)):
|
||||
if hyperlink == self[i].hyperlink:
|
||||
sep = " "
|
||||
if not self[i].text.strip() or not last_elm.strip():
|
||||
sep = ""
|
||||
text += sep + self[i].text
|
||||
last_elm = self[i].text
|
||||
else:
|
||||
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
||||
text = self[i].text
|
||||
last_elm = text
|
||||
hyperlink = self[i].hyperlink
|
||||
if text:
|
||||
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
||||
return simplified
|
||||
|
||||
def split_by_newline(self):
|
||||
super_list = []
|
||||
active_annotated_text_list = AnnotatedTextList()
|
||||
for el in self:
|
||||
sub_texts = el.text.split("\n")
|
||||
if len(sub_texts) == 1:
|
||||
active_annotated_text_list.append(el)
|
||||
else:
|
||||
for text in sub_texts:
|
||||
sub_el = deepcopy(el)
|
||||
sub_el.text = text
|
||||
active_annotated_text_list.append(sub_el)
|
||||
super_list.append(active_annotated_text_list)
|
||||
active_annotated_text_list = AnnotatedTextList()
|
||||
if active_annotated_text_list:
|
||||
super_list.append(active_annotated_text_list)
|
||||
return super_list
|
||||
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
@override
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: InputDocument,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
original_url: Optional[AnyUrl] = None,
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
self.soup: Optional[Tag] = None
|
||||
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.ctx = _Context()
|
||||
for i in range(self.max_levels):
|
||||
self.parents[i] = None
|
||||
self.hyperlink = None
|
||||
self.original_url = original_url
|
||||
|
||||
try:
|
||||
raw = (
|
||||
@@ -160,26 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: The XML tag to parse.
|
||||
doc: The Docling document to be updated with the parsed content.
|
||||
"""
|
||||
buffer: list[str] = []
|
||||
buffer: AnnotatedTextList = AnnotatedTextList()
|
||||
|
||||
def flush_buffer():
|
||||
if not buffer:
|
||||
return
|
||||
text = "".join(buffer).strip()
|
||||
annotated_text_list = buffer.simplify_text_elements()
|
||||
parts = annotated_text_list.split_by_newline()
|
||||
buffer.clear()
|
||||
if not text:
|
||||
|
||||
if not "".join([el.text for el in annotated_text_list]):
|
||||
return
|
||||
for part in text.split("\n"):
|
||||
seg = part.strip()
|
||||
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
||||
if seg:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
text=seg_clean,
|
||||
orig=seg,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
for annotated_text_list in parts:
|
||||
with self.use_inline_group(annotated_text_list, doc):
|
||||
for annotated_text in annotated_text_list:
|
||||
if annotated_text.text.strip():
|
||||
seg_clean = HTMLDocumentBackend._clean_unicode(
|
||||
annotated_text.text.strip()
|
||||
)
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
text=seg_clean,
|
||||
content_layer=self.content_layer,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
|
||||
for node in element.contents:
|
||||
if isinstance(node, Tag):
|
||||
@@ -187,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if name == "img":
|
||||
flush_buffer()
|
||||
self._emit_image(node, doc)
|
||||
elif name == "a":
|
||||
with self.use_hyperlink(node):
|
||||
self._walk(node, doc)
|
||||
elif name in _BLOCK_TAGS:
|
||||
flush_buffer()
|
||||
self._handle_block(node, doc)
|
||||
@@ -194,28 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
flush_buffer()
|
||||
self._walk(node, doc)
|
||||
else:
|
||||
buffer.append(node.text)
|
||||
buffer.extend(
|
||||
self._extract_text_and_hyperlink_recursively(
|
||||
node, find_parent_annotation=True, keep_newlines=True
|
||||
)
|
||||
)
|
||||
elif isinstance(node, NavigableString) and not isinstance(
|
||||
node, PreformattedString
|
||||
):
|
||||
buffer.append(str(node))
|
||||
if str(node).strip("\n\r") == "":
|
||||
flush_buffer()
|
||||
else:
|
||||
buffer.extend(
|
||||
self._extract_text_and_hyperlink_recursively(
|
||||
node, find_parent_annotation=True, keep_newlines=True
|
||||
)
|
||||
)
|
||||
|
||||
flush_buffer()
|
||||
|
||||
def _extract_text_and_hyperlink_recursively(
|
||||
self,
|
||||
item: PageElement,
|
||||
ignore_list=False,
|
||||
find_parent_annotation=False,
|
||||
keep_newlines=False,
|
||||
) -> AnnotatedTextList:
|
||||
result: AnnotatedTextList = AnnotatedTextList()
|
||||
|
||||
# If find_parent_annotation, make sure that we keep track of
|
||||
# any a-tag that has been present in the DOM-parents already.
|
||||
if find_parent_annotation:
|
||||
this_parent = item.parent
|
||||
while this_parent is not None:
|
||||
if this_parent.name == "a" and this_parent.get("href"):
|
||||
with self.use_hyperlink(this_parent):
|
||||
return self._extract_text_and_hyperlink_recursively(
|
||||
item, ignore_list
|
||||
)
|
||||
this_parent = this_parent.parent
|
||||
|
||||
if isinstance(item, PreformattedString):
|
||||
return AnnotatedTextList()
|
||||
|
||||
if isinstance(item, NavigableString):
|
||||
text = item.strip()
|
||||
if text:
|
||||
return AnnotatedTextList(
|
||||
[AnnotatedText(text=text, hyperlink=self.hyperlink)]
|
||||
)
|
||||
if keep_newlines and item.strip("\n\r") == "":
|
||||
return AnnotatedTextList(
|
||||
[AnnotatedText(text="\n", hyperlink=self.hyperlink)]
|
||||
)
|
||||
return AnnotatedTextList()
|
||||
|
||||
tag = cast(Tag, item)
|
||||
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
||||
for child in tag:
|
||||
if isinstance(child, Tag) and child.name == "a":
|
||||
with self.use_hyperlink(child):
|
||||
result.extend(
|
||||
self._extract_text_and_hyperlink_recursively(
|
||||
child, ignore_list, keep_newlines=keep_newlines
|
||||
)
|
||||
)
|
||||
else:
|
||||
# Recursively get the child's text content
|
||||
result.extend(
|
||||
self._extract_text_and_hyperlink_recursively(
|
||||
child, ignore_list, keep_newlines=keep_newlines
|
||||
)
|
||||
)
|
||||
return result
|
||||
|
||||
@contextmanager
|
||||
def use_hyperlink(self, tag):
|
||||
this_href = tag.get("href")
|
||||
if this_href is None:
|
||||
yield None
|
||||
else:
|
||||
if this_href:
|
||||
old_hyperlink = self.hyperlink
|
||||
if self.original_url is not None:
|
||||
this_href = urljoin(self.original_url, this_href)
|
||||
# ugly fix for relative links since pydantic does not support them.
|
||||
try:
|
||||
AnyUrl(this_href)
|
||||
except ValidationError:
|
||||
this_href = Path(this_href)
|
||||
self.hyperlink = this_href
|
||||
try:
|
||||
yield None
|
||||
finally:
|
||||
if this_href:
|
||||
self.hyperlink = old_hyperlink
|
||||
|
||||
@contextmanager
|
||||
def use_inline_group(
|
||||
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
||||
):
|
||||
"""Create an inline group for annotated texts.
|
||||
|
||||
Checks if annotated_text_list has more than one item and if so creates an inline
|
||||
group in which the text elements can then be generated. While the context manager
|
||||
is active the inline group is set as the current parent.
|
||||
|
||||
Args:
|
||||
annotated_text_list (AnnotatedTextList): Annotated text
|
||||
doc (DoclingDocument): Currently used document
|
||||
|
||||
Yields:
|
||||
None: _description_
|
||||
"""
|
||||
if len(annotated_text_list) > 1:
|
||||
inline_fmt = doc.add_group(
|
||||
label=GroupLabel.INLINE,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.parents[self.level + 1] = inline_fmt
|
||||
self.level += 1
|
||||
try:
|
||||
yield None
|
||||
finally:
|
||||
self.parents[self.level] = None
|
||||
self.level -= 1
|
||||
else:
|
||||
yield None
|
||||
|
||||
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
tag_name = tag.name.lower()
|
||||
# set default content layer to BODY as soon as we encounter a heading
|
||||
self.content_layer = ContentLayer.BODY
|
||||
level = int(tag_name[1])
|
||||
text = tag.get_text(strip=True, separator=" ")
|
||||
text_clean = HTMLDocumentBackend._clean_unicode(text)
|
||||
annotated_text_list = self._extract_text_and_hyperlink_recursively(
|
||||
tag, find_parent_annotation=True
|
||||
)
|
||||
annotated_text = annotated_text_list.to_single_text_element()
|
||||
text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
|
||||
# the first level is for the title item
|
||||
if level == 1:
|
||||
for key in self.parents.keys():
|
||||
self.parents[key] = None
|
||||
self.level = 0
|
||||
self.parents[self.level + 1] = doc.add_title(
|
||||
text=text_clean, orig=text, content_layer=self.content_layer
|
||||
text_clean,
|
||||
content_layer=self.content_layer,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
# the other levels need to be lowered by 1 if a title was set
|
||||
else:
|
||||
@@ -241,9 +445,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.parents[self.level + 1] = doc.add_heading(
|
||||
parent=self.parents[self.level],
|
||||
text=text_clean,
|
||||
orig=text,
|
||||
orig=annotated_text.text,
|
||||
level=self.level,
|
||||
content_layer=self.content_layer,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
self.level += 1
|
||||
for img_tag in tag("img"):
|
||||
@@ -292,37 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
marker = ""
|
||||
|
||||
# 2) extract only the "direct" text from this <li>
|
||||
parts: list[str] = []
|
||||
for child in li.contents:
|
||||
if isinstance(child, NavigableString) and not isinstance(
|
||||
child, PreformattedString
|
||||
):
|
||||
parts.append(child)
|
||||
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
|
||||
text_part = HTMLDocumentBackend.get_text(child)
|
||||
if text_part:
|
||||
parts.append(text_part)
|
||||
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
||||
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
||||
parts = self._extract_text_and_hyperlink_recursively(
|
||||
li, ignore_list=True, find_parent_annotation=True
|
||||
)
|
||||
min_parts = parts.simplify_text_elements()
|
||||
li_text = re.sub(
|
||||
r"\s+|\n+", " ", "".join([el.text for el in min_parts])
|
||||
).strip()
|
||||
|
||||
# 3) add the list item
|
||||
if li_text:
|
||||
self.parents[self.level + 1] = doc.add_list_item(
|
||||
text=li_clean,
|
||||
enumerated=is_ordered,
|
||||
marker=marker,
|
||||
orig=li_text,
|
||||
parent=list_group,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
if len(min_parts) > 1:
|
||||
# create an empty list element in order to hook the inline group onto that one
|
||||
self.parents[self.level + 1] = doc.add_list_item(
|
||||
text="",
|
||||
enumerated=is_ordered,
|
||||
marker=marker,
|
||||
parent=list_group,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.level += 1
|
||||
with self.use_inline_group(min_parts, doc):
|
||||
for annotated_text in min_parts:
|
||||
li_text = re.sub(
|
||||
r"\s+|\n+", " ", annotated_text.text
|
||||
).strip()
|
||||
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
text=li_clean,
|
||||
content_layer=self.content_layer,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
|
||||
# 4) recurse into any nested lists, attaching them to this <li> item
|
||||
for sublist in li({"ul", "ol"}, recursive=False):
|
||||
if isinstance(sublist, Tag):
|
||||
self.level += 1
|
||||
self._handle_block(sublist, doc)
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
# 4) recurse into any nested lists, attaching them to this <li> item
|
||||
for sublist in li({"ul", "ol"}, recursive=False):
|
||||
if isinstance(sublist, Tag):
|
||||
self._handle_block(sublist, doc)
|
||||
|
||||
# now the list element with inline group is not a parent anymore
|
||||
self.parents[self.level] = None
|
||||
self.level -= 1
|
||||
else:
|
||||
annotated_text = min_parts[0]
|
||||
li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
|
||||
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
||||
self.parents[self.level + 1] = doc.add_list_item(
|
||||
text=li_clean,
|
||||
enumerated=is_ordered,
|
||||
marker=marker,
|
||||
orig=li_text,
|
||||
parent=list_group,
|
||||
content_layer=self.content_layer,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
|
||||
# 4) recurse into any nested lists, attaching them to this <li> item
|
||||
for sublist in li({"ul", "ol"}, recursive=False):
|
||||
if isinstance(sublist, Tag):
|
||||
self.level += 1
|
||||
self._handle_block(sublist, doc)
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
else:
|
||||
for sublist in li({"ul", "ol"}, recursive=False):
|
||||
if isinstance(sublist, Tag):
|
||||
@@ -351,17 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._handle_list(tag, doc)
|
||||
|
||||
elif tag_name in {"p", "address", "summary"}:
|
||||
for part in tag.text.split("\n"):
|
||||
seg = part.strip()
|
||||
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
||||
if seg:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
text=seg_clean,
|
||||
orig=seg,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
text_list = self._extract_text_and_hyperlink_recursively(
|
||||
tag, find_parent_annotation=True
|
||||
)
|
||||
annotated_texts = text_list.simplify_text_elements()
|
||||
for part in annotated_texts.split_by_newline():
|
||||
with self.use_inline_group(part, doc):
|
||||
for annotated_text in part:
|
||||
if seg := annotated_text.text.strip():
|
||||
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
text=seg_clean,
|
||||
content_layer=self.content_layer,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
self._emit_image(img_tag, doc)
|
||||
@@ -380,15 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif tag_name in {"pre", "code"}:
|
||||
# handle monospace code snippets (pre).
|
||||
text = tag.get_text(strip=True)
|
||||
text_clean = HTMLDocumentBackend._clean_unicode(text)
|
||||
if text:
|
||||
doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=text_clean,
|
||||
orig=text,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
text_list = self._extract_text_and_hyperlink_recursively(
|
||||
tag, find_parent_annotation=True
|
||||
)
|
||||
annotated_texts = text_list.simplify_text_elements()
|
||||
with self.use_inline_group(annotated_texts, doc):
|
||||
for annotated_text in annotated_texts:
|
||||
text_clean = HTMLDocumentBackend._clean_unicode(
|
||||
annotated_text.text.strip()
|
||||
)
|
||||
doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=text_clean,
|
||||
content_layer=self.content_layer,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
|
||||
elif tag_name == "details":
|
||||
# handle details and its content.
|
||||
@@ -405,22 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
||||
figure = img_tag.find_parent("figure")
|
||||
caption: str = ""
|
||||
caption: AnnotatedTextList = AnnotatedTextList()
|
||||
|
||||
# check if the figure has a link - this is HACK:
|
||||
def get_img_hyperlink(img_tag):
|
||||
this_parent = img_tag.parent
|
||||
while this_parent is not None:
|
||||
if this_parent.name == "a" and this_parent.get("href"):
|
||||
return this_parent.get("href")
|
||||
this_parent = this_parent.parent
|
||||
return None
|
||||
|
||||
if img_hyperlink := get_img_hyperlink(img_tag):
|
||||
caption.append(
|
||||
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
|
||||
)
|
||||
|
||||
if isinstance(figure, Tag):
|
||||
caption_tag = figure.find("figcaption", recursive=False)
|
||||
if isinstance(caption_tag, Tag):
|
||||
caption = caption_tag.get_text()
|
||||
if not caption:
|
||||
caption = str(img_tag.get("alt", "")).strip()
|
||||
caption = self._extract_text_and_hyperlink_recursively(
|
||||
caption_tag, find_parent_annotation=True
|
||||
)
|
||||
if not caption and img_tag.get("alt"):
|
||||
caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
|
||||
|
||||
caption_anno_text = caption.to_single_text_element()
|
||||
|
||||
caption_item: Optional[TextItem] = None
|
||||
if caption:
|
||||
caption_clean = HTMLDocumentBackend._clean_unicode(caption)
|
||||
if caption_anno_text.text:
|
||||
text_clean = HTMLDocumentBackend._clean_unicode(
|
||||
caption_anno_text.text.strip()
|
||||
)
|
||||
print(caption_anno_text)
|
||||
caption_item = doc.add_text(
|
||||
label=DocItemLabel.CAPTION,
|
||||
text=caption_clean,
|
||||
orig=caption,
|
||||
text=text_clean,
|
||||
orig=caption_anno_text.text,
|
||||
content_layer=self.content_layer,
|
||||
hyperlink=caption_anno_text.hyperlink,
|
||||
)
|
||||
|
||||
doc.add_picture(
|
||||
|
||||
Reference in New Issue
Block a user