refactor(HTML): handle text from styled html (#1960)

* A new HTML backend that handles styled HTML (ignoring the styling) as well as images.

Images are parsed as placeholders with a caption, if it exists.

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: vaaale <2428222+vaaale@users.noreply.github.com>
Signed-off-by: Alexander Vaagan <alexander.vaagan@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>

* tests(HTML): re-enable test_ordered_lists

Re-enable test_ordered_lists regression test for the HTML backend since
docling-core now supports ordered lists with custom start value.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Alexander Vaagan <alexander.vaagan@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>
Co-authored-by: Alexander Vaagan <2428222+vaaale@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-07-22 13:16:31 +02:00
committed by GitHub
parent 5d98bcea1b
commit a069b1175b
15 changed files with 3241 additions and 2183 deletions

View File

@@ -1,10 +1,11 @@
import logging
import re
import traceback
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4 import BeautifulSoup, NavigableString, Tag
from bs4.element import PreformattedString
from docling_core.types.doc import (
DocItem,
@@ -15,6 +16,7 @@ from docling_core.types.doc import (
GroupLabel,
TableCell,
TableData,
TextItem,
)
from docling_core.types.doc.document import ContentLayer
from pydantic import BaseModel
@@ -26,10 +28,14 @@ from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
# tags that generate NodeItem elements
TAGS_FOR_NODE_ITEMS: Final = [
DEFAULT_IMAGE_WIDTH = 128
DEFAULT_IMAGE_HEIGHT = 128
# Tags that initiate distinct Docling items
_BLOCK_TAGS: Final = {
"address",
"details",
"figure",
"h1",
"h2",
"h3",
@@ -41,12 +47,9 @@ TAGS_FOR_NODE_ITEMS: Final = [
"code",
"ul",
"ol",
"li",
"summary",
"table",
"figure",
"img",
]
}
class _Context(BaseModel):
@@ -56,12 +59,16 @@ class _Context(BaseModel):
class HTMLDocumentBackend(DeclarativeDocumentBackend):
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
def __init__(
self,
in_doc: InputDocument,
path_or_stream: Union[BytesIO, Path],
):
super().__init__(in_doc, path_or_stream)
self.soup: Optional[Tag] = None
# HTML file:
self.path_or_stream = path_or_stream
# Initialise the parents for the hierarchy
# Initialize the parents for the hierarchy
self.max_levels = 10
self.level = 0
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
@@ -70,13 +77,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[i] = None
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue()
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "rb") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
raw = (
path_or_stream.getvalue()
if isinstance(path_or_stream, BytesIO)
else Path(path_or_stream).read_bytes()
)
self.soup = BeautifulSoup(raw, "html.parser")
except Exception as e:
raise RuntimeError(
"Could not initialize HTML backend for file with "
@@ -96,7 +102,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def unload(self):
    """Close the input if it is an in-memory stream and drop the reference to it."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()
    # The reference is cleared for both BytesIO and non-BytesIO inputs.
    self.path_or_stream = None
@classmethod
@@ -106,211 +111,156 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
@override
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
_log.debug("Starting HTML conversion...")
if not self.is_valid():
raise RuntimeError("Invalid HTML document.")
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="text/html",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
_log.debug("Trying to convert HTML...")
if self.is_valid():
assert self.soup is not None
content = self.soup.body or self.soup
# Replace <br> tags with newline characters
# TODO: remove style to avoid losing text from tags like i, b, span, ...
for br in content("br"):
br.replace_with(NavigableString("\n"))
assert self.soup is not None
# set the title as furniture, since it is part of the document metadata
title = self.soup.title
if title:
doc.add_title(
text=title.get_text(separator=" ", strip=True),
content_layer=ContentLayer.FURNITURE,
)
# remove scripts/styles
for tag in self.soup(["script", "style"]):
tag.decompose()
content = self.soup.body or self.soup
# normalize <br> tags
for br in content("br"):
br.replace_with(NavigableString("\n"))
# set default content layer
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
# reset context
self.ctx = _Context()
try:
self._walk(content, doc)
except Exception:
print(traceback.format_exc())
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
self.ctx = _Context() # reset context
self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend "
"failed to init."
)
return doc
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
if isinstance(element, Tag):
try:
self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
_log.error(
f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(
element, PreformattedString
):
# Floating text outside paragraphs or analyzed tags
text += element
siblings: list[Tag] = [
item for item in element.next_siblings if isinstance(item, Tag)
]
if element.next_sibling is None or any(
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
):
text = text.strip()
if text and tag.name in ["div"]:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)
text = ""
def _walk(self, element: Tag, doc: DoclingDocument) -> None:
"""Parse an XML tag by recursively walking its content.
return
While walking, the method buffers inline text across tags like <b> or <span>,
emitting text nodes only at block boundaries.
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
    """Route a tag to the handler for its name; unknown tags are walked.

    Args:
        tag: The tag to process.
        doc: The Docling document to be updated.
    """
    handler_by_name = {
        "h1": "handle_header",
        "h2": "handle_header",
        "h3": "handle_header",
        "h4": "handle_header",
        "h5": "handle_header",
        "h6": "handle_header",
        "p": "handle_paragraph",
        "address": "handle_paragraph",
        "summary": "handle_paragraph",
        "pre": "handle_code",
        "code": "handle_code",
        "ul": "handle_list",
        "ol": "handle_list",
        "li": "handle_list_item",
        "table": "handle_table",
        "figure": "handle_figure",
        "img": "handle_image",
        "details": "handle_details",
    }
    # Tags without a dedicated handler are traversed for their children.
    method_name = handler_by_name.get(tag.name, "walk")
    getattr(self, method_name)(tag, doc)
Args:
element: The XML tag to parse.
doc: The Docling document to be updated with the parsed content.
"""
buffer: list[str] = []
def get_text(self, item: PageElement) -> str:
    """Return the recursively extracted text of *item* followed by one space."""
    joined = "".join(self.extract_text_recursively(item))
    return joined + " "
# Collect the text of a node and all of its descendants
def extract_text_recursively(self, item: PageElement) -> list[str]:
    """Return the text under *item* as a list of strings.

    A string node is returned as-is in a one-element list. For a tag, the
    texts of its children are concatenated and suffixed with a space; the
    children of ``ul``/``ol`` tags are not visited, so such a tag yields
    only the single space.
    """
    if isinstance(item, NavigableString):
        return [item]

    node = cast(Tag, item)
    pieces: list[str] = []
    if node.name not in ["ul", "ol"]:
        pieces = [
            fragment
            for child in node
            for fragment in self.extract_text_recursively(child)
        ]
    return ["".join(pieces) + " "]
def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
    """Wrap the content of a <details> tag in a section group.

    The group becomes the parent one level below the current one while the
    tag's children are walked; the level is decremented again afterwards.
    """
    depth = self.level
    details_group = doc.add_group(
        name="details",
        label=GroupLabel.SECTION,
        parent=self.parents[depth],
        content_layer=self.content_layer,
    )
    self.parents[depth + 1] = details_group
    self.level = depth + 1

    self.walk(element, doc)

    # Read self.level afresh here: walking the children may have changed it.
    self.parents[self.level + 1] = None
    self.level -= 1
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", ""))
text = element.text.strip()
self.content_layer = ContentLayer.BODY
if hlevel == 1:
for key in self.parents.keys():
self.parents[key] = None
self.level = 1
self.parents[self.level] = doc.add_text(
parent=self.parents[0],
label=DocItemLabel.TITLE,
text=text,
content_layer=self.content_layer,
)
else:
if hlevel > self.level:
# add invisible group
for i in range(self.level + 1, hlevel):
self.parents[i] = doc.add_group(
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
def flush_buffer():
if not buffer:
return
text = "".join(buffer).strip()
buffer.clear()
if not text:
return
for part in text.split("\n"):
seg = part.strip()
if seg:
doc.add_text(
DocItemLabel.TEXT,
seg,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
self.level = hlevel
elif hlevel < self.level:
for node in element.contents:
if isinstance(node, Tag):
name = node.name.lower()
if name == "img":
flush_buffer()
self._emit_image(node, doc)
elif name in _BLOCK_TAGS:
flush_buffer()
self._handle_block(node, doc)
elif node.find(_BLOCK_TAGS):
flush_buffer()
self._walk(node, doc)
else:
buffer.append(node.text)
elif isinstance(node, NavigableString) and not isinstance(
node, PreformattedString
):
buffer.append(str(node))
flush_buffer()
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
tag_name = tag.name.lower()
# set default content layer to BODY as soon as we encounter a heading
self.content_layer = ContentLayer.BODY
level = int(tag_name[1])
text = tag.get_text(strip=True, separator=" ")
# the first level is for the title item
if level == 1:
for key in self.parents.keys():
self.parents[key] = None
self.level = 0
self.parents[self.level + 1] = doc.add_title(
text, content_layer=self.content_layer
)
# the other levels need to be lowered by 1 if a title was set
else:
level -= 1
if level > self.level:
# add invisible group
for i in range(self.level, level):
_log.debug(f"Adding invisible group to level {i}")
self.parents[i + 1] = doc.add_group(
name=f"header-{i + 1}",
label=GroupLabel.SECTION,
parent=self.parents[i],
content_layer=self.content_layer,
)
self.level = level
elif level < self.level:
# remove the tail
for key in self.parents.keys():
if key > hlevel:
if key > level + 1:
_log.debug(f"Remove the tail of level {key}")
self.parents[key] = None
self.level = hlevel
self.parents[hlevel] = doc.add_heading(
parent=self.parents[hlevel - 1],
text=text,
level=hlevel - 1,
content_layer=self.content_layer,
)
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles monospace code snippets (pre)."""
if element.text is None:
return
text = element.text.strip()
if text:
doc.add_code(
self.level = level
self.parents[self.level + 1] = doc.add_heading(
parent=self.parents[self.level],
text=text,
level=self.level,
content_layer=self.content_layer,
)
self.level += 1
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
self._emit_image(img_tag, doc)
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
    """Add the stripped text of a paragraph-like tag as a text item.

    Nothing is added when the tag has no text or only whitespace.
    """
    raw_text = element.text
    if raw_text is None:
        return

    stripped = raw_text.strip()
    if not stripped:
        return

    doc.add_text(
        parent=self.parents[self.level],
        label=DocItemLabel.TEXT,
        text=stripped,
        content_layer=self.content_layer,
    )
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
tag_name = tag.name.lower()
start: Optional[int] = None
if is_ordered := element.name == "ol":
start_attr = element.get("start")
name: str = ""
is_ordered = tag_name == "ol"
if is_ordered:
start_attr = tag.get("start")
if isinstance(start_attr, str) and start_attr.isnumeric():
start = int(start_attr)
name = "ordered list" + (f" start {start}" if start is not None else "")
else:
name = "list"
# create a list group
# Create the list container
list_group = doc.add_list_group(
name=name,
parent=self.parents[self.level],
@@ -320,64 +270,152 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
if is_ordered and start is not None:
self.ctx.list_start_by_ref[list_group.self_ref] = start
self.level += 1
self.walk(element, doc)
# For each top-level <li> in this list
for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
if not isinstance(li, Tag):
continue
# sub-list items should be indented under main list items, but temporarily
# addressing invalid HTML (docling-core/issues/357)
if li.name in {"ul", "ol"}:
self._handle_block(li, doc)
else:
# 1) determine the marker
if is_ordered and start is not None:
marker = f"{start + len(list_group.children)}."
else:
marker = ""
# 2) extract only the "direct" text from this <li>
parts: list[str] = []
for child in li.contents:
if isinstance(child, NavigableString) and not isinstance(
child, PreformattedString
):
parts.append(child)
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
text_part = child.get_text()
if text_part:
parts.append(text_part)
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
# 3) add the list item
if li_text:
self.parents[self.level + 1] = doc.add_list_item(
text=li_text,
enumerated=is_ordered,
marker=marker,
parent=list_group,
content_layer=self.content_layer,
)
# 4) recurse into any nested lists, attaching them to this <li> item
for sublist in li({"ul", "ol"}, recursive=False):
if isinstance(sublist, Tag):
self.level += 1
self._handle_block(sublist, doc)
self.parents[self.level + 1] = None
self.level -= 1
else:
for sublist in li({"ul", "ol"}, recursive=False):
if isinstance(sublist, Tag):
self._handle_block(sublist, doc)
# 5) extract any images under this <li>
for img_tag in li("img"):
if isinstance(img_tag, Tag):
self._emit_image(img_tag, doc)
self.parents[self.level + 1] = None
self.level -= 1
def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list item tags (li)."""
nested_list = element.find(["ul", "ol"])
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
tag_name = tag.name.lower()
parent = self.parents[self.level]
if parent is None:
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
return
enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
marker = f"{start + len(parent.children)}."
else:
marker = ""
if tag_name == "figure":
img_tag = tag.find("img")
if isinstance(img_tag, Tag):
self._emit_image(img_tag, doc)
if nested_list:
# Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
text: str = self.get_text(element)
# Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
self._handle_heading(tag, doc)
if len(text) > 0:
# create a list-item
self.parents[self.level + 1] = doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=parent,
elif tag_name in {"ul", "ol"}:
self._handle_list(tag, doc)
elif tag_name in {"p", "address", "summary"}:
for part in tag.text.split("\n"):
seg = part.strip()
if seg:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=seg,
content_layer=self.content_layer,
)
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
self._emit_image(img_tag, doc)
elif tag_name == "table":
data = HTMLDocumentBackend.parse_table_data(tag)
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
self._emit_image(tag, doc)
if data is not None:
doc.add_table(
data=data,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
self.level += 1
self.walk(element, doc)
self.parents[self.level + 1] = None
self.level -= 1
else:
self.walk(element, doc)
elif element.text.strip():
text = element.text.strip()
elif tag_name in {"pre", "code"}:
# handle monospace code snippets (pre).
text = tag.get_text(strip=True)
if text:
doc.add_code(
parent=self.parents[self.level],
text=text,
content_layer=self.content_layer,
)
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=parent,
elif tag_name == "details":
# handle details and its content.
self.parents[self.level + 1] = doc.add_group(
name="details",
label=GroupLabel.SECTION,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
else:
_log.debug(f"list-item has no text: {element}")
self.level += 1
self._walk(tag, doc)
self.parents[self.level + 1] = None
self.level -= 1
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
    """Add a picture item for an <img> tag, with a caption when one is found.

    The caption is taken from a <figcaption> that is a direct child of the
    closest enclosing <figure>. If that yields no text, the image's ``alt``
    attribute is used instead. When both are empty the picture is added
    without a caption item.

    Args:
        img_tag: The <img> tag to emit.
        doc: The Docling document to be updated.
    """
    caption: str = ""
    figure = img_tag.find_parent("figure")
    if isinstance(figure, Tag):
        caption_tag = figure.find("figcaption", recursive=False)
        if isinstance(caption_tag, Tag):
            # Strip surrounding whitespace so that a blank <figcaption> falls
            # through to the ``alt`` text instead of producing an empty caption.
            caption = caption_tag.get_text().strip()
    if not caption:
        caption = str(img_tag.get("alt", "")).strip()

    caption_item: Optional[TextItem] = None
    if caption:
        caption_item = doc.add_text(
            DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
        )

    doc.add_picture(
        caption=caption_item,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
@staticmethod
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
@@ -502,84 +540,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
data.table_cells.append(table_cell)
return data
def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
    """Parse a <table> tag and add it to the document if parsing yields data."""
    parsed = HTMLDocumentBackend.parse_table_data(element)
    if parsed is None:
        return
    doc.add_table(
        data=parsed,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
    """Render a <ul> or <ol> as a list of indented text lines.

    Items of an ordered list are prefixed with their 1-based position and a
    dot, items of an unordered list with ``*``. The first nested list found
    inside an item is rendered recursively one level deeper. Any other tag
    yields an empty list.
    """
    list_kind = list_element.name
    if list_kind not in ("ol", "ul"):
        return []

    indent = " " * level
    lines: list[str] = []
    for position, entry in enumerate(list_element("li", recursive=False), 1):
        if not isinstance(entry, Tag):
            continue
        prefix = f"{position}." if list_kind == "ol" else "*"
        lines.append(f"{indent}{prefix} {entry.get_text(strip=True)}")
        # Descend into the first nested list, if any
        child_list = entry.find(["ul", "ol"])
        if isinstance(child_list, Tag):
            lines.extend(self.get_list_text(child_list, level + 1))
    return lines
def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
    """Add a picture item for a <figure> tag.

    If the figure contains a <figcaption>, the concatenated and stripped
    text of its children is added as a caption item and attached to the
    picture; otherwise the picture is added without a caption.
    """
    caption_tag = element.find(["figcaption"])

    caption_item = None
    if isinstance(caption_tag, Tag):
        caption_text = "".join(child.text for child in caption_tag).strip()
        caption_item = doc.add_text(
            label=DocItemLabel.CAPTION,
            text=caption_text,
            content_layer=self.content_layer,
        )

    doc.add_picture(
        parent=self.parents[self.level],
        caption=caption_item,
        content_layer=self.content_layer,
    )
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
    """Log the <img> tag and add a picture item without a caption for it."""
    _log.debug(f"ignoring <img> tags at the moment: {element}")
    current_parent = self.parents[self.level]
    doc.add_picture(
        parent=current_parent,
        caption=None,
        content_layer=self.content_layer,
    )