A new HTML backend that handles styled HTML (ignores the styling) as well as images.

Note: MyPy fails.
Seems to be a known issue with BeautifulSoup:
https://github.com/python/typeshed/pull/13604

- Fixed issues with handling nested lists.
- Fixed some issues with spaces between text fragments.
- Changed naming of image configuration from INLINE to EMBEDDED. Also renamed the corresponding class.
- Introduced constants for the default image width and height.

Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>
This commit is contained in:
vaaale 2025-05-24 22:25:51 +02:00
parent 733360c7b2
commit 5d08b749af
2 changed files with 6022 additions and 162 deletions

View File

@ -7,6 +7,7 @@ from pathlib import Path
from typing import Optional, Union
import requests
from PIL import Image, UnidentifiedImageError
from bs4 import BeautifulSoup, NavigableString, Tag
from docling_core.types.doc import (
DocItemLabel,
@ -17,8 +18,7 @@ from docling_core.types.doc import (
TableData,
)
from docling_core.types.doc.document import ContentLayer, ImageRef
from PIL import Image, UnidentifiedImageError
from pydantic import AnyUrl, HttpUrl, ValidationError
from pydantic import AnyUrl, ValidationError
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
@ -27,15 +27,17 @@ from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
DEFAULT_IMAGE_WIDTH = 128
DEFAULT_IMAGE_HEIGHT = 128
# Tags that initiate distinct Docling items
_BLOCK_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table"}
class ImageOptions(str, Enum):
"""Image options for HTML backend."""
NONE = "none"
INLINE = "inline"
EMBEDDED = "embedded"
REFERENCED = "referenced"
@ -49,7 +51,6 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
):
super().__init__(in_doc, path_or_stream)
self.image_options = image_options
self.soup: Optional[Tag] = None
try:
raw = (
path_or_stream.getvalue()
@ -88,35 +89,27 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
_log.debug("Starting HTML conversion...")
if not self.is_valid():
raise RuntimeError("Invalid HTML document.")
assert self.soup is not None
# Remove all script/style content
title = self.soup.find("title")
if title:
doc.add_title(title.get_text())
# remove scripts/styles
for tag in self.soup.find_all(["script", "style"]):
tag.decompose()
body = self.soup.body or self.soup
# Normalize <br> tags to newline strings
# normalize <br>
for br in body.find_all("br"):
br.replace_with(NavigableString("\n"))
# Decide content layer by presence of headers
headers = body.find(list(_BLOCK_TAGS))
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
# Walk the body to build the DoclingDocument
self._walk(body, doc, parent=doc.body)
return doc
def _walk(self, element: Tag, doc: DoclingDocument, parent) -> None:
"""
Recursively walk element.contents, buffering inline text across tags like <b> or <span>,
emitting text nodes only at block boundaries, and extracting images immediately.
"""
buffer: list[str] = []
def flush_buffer():
@ -126,88 +119,93 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
buffer.clear()
if not text:
return
# Split on newlines for <br>
for part in text.split("\n"):
seg = part.strip()
if seg:
doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
for node in element.contents:
# Skip scripts/styles
if isinstance(node, Tag) and node.name.lower() in ("script", "style"):
continue
# Immediate image extraction
if isinstance(node, Tag) and node.name.lower() == "img":
flush_buffer()
self._emit_image(node, doc, parent)
continue
# Block-level element triggers flush + handle
if isinstance(node, Tag) and node.name.lower() in _BLOCK_TAGS:
flush_buffer()
self._handle_block(node, doc, parent)
# Inline tag with nested blocks: recurse
elif isinstance(node, Tag) and node.find(list(_BLOCK_TAGS)):
flush_buffer()
self._walk(node, doc, parent)
# Inline text
elif isinstance(node, Tag):
buffer.append(node.get_text())
elif isinstance(node, NavigableString):
buffer.append(str(node))
# Flush any remaining text
flush_buffer()
def _handle_block(self, tag: Tag, doc: DoclingDocument, parent) -> None:
tag_name = tag.name.lower()
if tag_name == "h1":
text = tag.get_text(strip=True)
if text:
doc.add_title(text, parent=parent)
for img_tag in tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent)
elif tag_name in {"h2", "h3", "h4", "h5", "h6"}:
if tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
level = int(tag_name[1])
text = tag.get_text(strip=True)
text = tag.get_text(strip=False)
if text:
doc.add_heading(text, level=level, parent=parent)
doc.add_heading(text.strip(), level=level, parent=parent)
for img_tag in tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent)
elif tag_name == "p":
for part in tag.get_text().split("\n"):
seg = part.strip()
if seg:
doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
for img_tag in tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent)
for img_tag in tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent)
elif tag_name in {"ul", "ol"}:
is_ordered = tag_name == "ol"
group = (
is_ordered = (tag_name == "ol")
# Create the list container
list_group = (
doc.add_ordered_list(parent=parent)
if is_ordered
else doc.add_unordered_list(parent=parent)
)
# For each top-level <li> in this list
for li in tag.find_all("li", recursive=False):
li_text = li.get_text(separator=" ", strip=True)
# 1) extract only the "direct" text from this <li>
parts: list[str] = []
for child in li.contents:
if isinstance(child, NavigableString):
text_part = child.strip()
if text_part:
parts.append(text_part)
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
text_part = child.get_text(separator=" ", strip=True)
if text_part:
parts.append(text_part)
li_text = " ".join(parts)
# 2) add the list item
li_item = doc.add_list_item(
text=li_text, enumerated=is_ordered, parent=group
text=li_text, enumerated=is_ordered, parent=list_group
)
# Nested lists inside <li>
for sub in li.find_all(["ul", "ol"], recursive=False):
self._handle_block(sub, doc, parent=group)
# 3) recurse into any nested lists, attaching them to this <li> item
for sublist in li.find_all(["ul", "ol"], recursive=False):
self._handle_block(sublist, doc, parent=li_item)
# 4) extract any images under this <li>
for img_tag in li.find_all("img", recursive=True):
self._emit_image(img_tag, doc, li_item)
elif tag_name == "table":
# Add table item and extract nested images
data = self._parse_table(tag, doc, parent)
doc.add_table(data=data, parent=parent)
def _emit_image(self, img_tag: Tag, doc: DoclingDocument, parent) -> None:
"""
Helper to create a PictureItem (with optional CAPTION) for an <img> tag.
"""
if ImageOptions.NONE == self.image_options:
if self.image_options == ImageOptions.NONE:
return
alt = (img_tag.get("alt") or "").strip()
@ -215,46 +213,40 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
if alt:
caption_item = doc.add_text(DocItemLabel.CAPTION, alt, parent=parent)
src_url = img_tag.get("src")
width = img_tag.get("width", "128")
height = img_tag.get("height", "128")
img_ref = None
if ImageOptions.INLINE == self.image_options:
src_url = img_tag.get("src", "")
width = img_tag.get("width", str(DEFAULT_IMAGE_WIDTH))
height = img_tag.get("height", str(DEFAULT_IMAGE_HEIGHT))
img_ref: Optional[ImageRef] = None
if self.image_options == ImageOptions.EMBEDDED:
try:
if src_url.startswith("http"):
img = Image.open(requests.get(src_url, stream=True).raw)
elif src_url.startswith("file:"):
img = Image.open(src_url)
elif src_url.startswith("data:"):
image_data = re.sub("^data:image/.+;base64,", "", src_url)
img = Image.open(BytesIO(base64.b64decode(image_data)))
data = re.sub(r"^data:image/.+;base64,", "", src_url)
img = Image.open(BytesIO(base64.b64decode(data)))
else:
return
img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi")[0]))
except (FileNotFoundError, UnidentifiedImageError) as ve:
_log.warning(f"Could not load image (src={src_url}): {ve}")
img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
except (FileNotFoundError, UnidentifiedImageError) as e:
_log.warning(f"Could not load image (src={src_url}): {e}")
return
elif ImageOptions.REFERENCED == self.image_options:
elif self.image_options == ImageOptions.REFERENCED:
try:
img_url = AnyUrl(src_url)
img_ref = ImageRef(
uri=img_url,
uri=AnyUrl(src_url),
dpi=72,
mimetype="image/png",
size=Size(width=float(width), height=float(height)),
)
except ValidationError as ve:
_log.warning(f"Could not load image (src={src_url}): {ve}")
except ValidationError as e:
_log.warning(f"Could not load image (src={src_url}): {e}")
return
doc.add_picture(image=img_ref, caption=caption_item, parent=parent)
def _parse_table(self, table_tag: Tag, doc: DoclingDocument, parent) -> TableData:
"""
Convert an HTML table into TableData, capturing cell spans and text,
and emitting any nested images as PictureItems.
"""
# Build TableData
rows = []
for sec in ("thead", "tbody", "tfoot"):
section = table_tag.find(sec)
@ -262,9 +254,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
rows.extend(section.find_all("tr", recursive=False))
if not rows:
rows = table_tag.find_all("tr", recursive=False)
occupied: dict[tuple[int, int], bool] = {}
cells: list[TableCell] = []
max_cols = 0
for r, tr in enumerate(rows):
c = 0
for cell_tag in tr.find_all(("td", "th"), recursive=False):
@ -292,9 +286,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
occupied[(r + dr, c + dc)] = True
c += cs
max_cols = max(max_cols, c)
# Emit images inside this table
# emit any images in the table
for img_tag in table_tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent)
return TableData(table_cells=cells, num_rows=len(rows), num_cols=max_cols)
@ -308,14 +304,14 @@ class HTMLDocumentBackend(BaseHTMLDocumentBackend):
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.NONE)
class HTMLDocumentBackendImagesInline(BaseHTMLDocumentBackend):
class HTMLDocumentBackendImagesEmbedded(BaseHTMLDocumentBackend):
@override
def __init__(
self,
in_doc: InputDocument,
path_or_stream: Union[BytesIO, Path],
):
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.INLINE)
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.EMBEDDED)
class HTMLDocumentBackendImagesReferenced(BaseHTMLDocumentBackend):

File diff suppressed because one or more lines are too long