A new HTML backend that handles styled HTML (ignores it) as well as images.

Note: MyPy fails.
Seems to be a known issue with BeautifulSoup:
https://github.com/python/typeshed/pull/13604

- Fixed issues with handling nested lists.
- Fixed some issues with spaces between text fragments
- Change naming of image configuration from INLINE to EMBEDDED. Also renamed corresponding class.
- Introduced constants for default image width and height.

Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>
This commit is contained in:
vaaale 2025-05-24 22:25:51 +02:00
parent 733360c7b2
commit 5d08b749af
2 changed files with 6022 additions and 162 deletions

View File

@ -7,6 +7,7 @@ from pathlib import Path
from typing import Optional, Union from typing import Optional, Union
import requests import requests
from PIL import Image, UnidentifiedImageError
from bs4 import BeautifulSoup, NavigableString, Tag from bs4 import BeautifulSoup, NavigableString, Tag
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
@ -17,8 +18,7 @@ from docling_core.types.doc import (
TableData, TableData,
) )
from docling_core.types.doc.document import ContentLayer, ImageRef from docling_core.types.doc.document import ContentLayer, ImageRef
from PIL import Image, UnidentifiedImageError from pydantic import AnyUrl, ValidationError
from pydantic import AnyUrl, HttpUrl, ValidationError
from typing_extensions import override from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
@ -27,15 +27,17 @@ from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
DEFAULT_IMAGE_WIDTH = 128
DEFAULT_IMAGE_HEIGHT = 128
# Tags that initiate distinct Docling items # Tags that initiate distinct Docling items
_BLOCK_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table"} _BLOCK_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table"}
class ImageOptions(str, Enum): class ImageOptions(str, Enum):
"""Image options for HTML backend.""" """Image options for HTML backend."""
NONE = "none" NONE = "none"
INLINE = "inline" EMBEDDED = "embedded"
REFERENCED = "referenced" REFERENCED = "referenced"
@ -49,7 +51,6 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
): ):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
self.image_options = image_options self.image_options = image_options
self.soup: Optional[Tag] = None
try: try:
raw = ( raw = (
path_or_stream.getvalue() path_or_stream.getvalue()
@ -88,35 +89,27 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )
doc = DoclingDocument(name=self.file.stem or "file", origin=origin) doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
_log.debug("Starting HTML conversion...") title = self.soup.find("title")
if not self.is_valid(): if title:
raise RuntimeError("Invalid HTML document.") doc.add_title(title.get_text())
assert self.soup is not None # remove scripts/styles
# Remove all script/style content
for tag in self.soup.find_all(["script", "style"]): for tag in self.soup.find_all(["script", "style"]):
tag.decompose() tag.decompose()
body = self.soup.body or self.soup body = self.soup.body or self.soup
# Normalize <br> tags to newline strings # normalize <br>
for br in body.find_all("br"): for br in body.find_all("br"):
br.replace_with(NavigableString("\n")) br.replace_with(NavigableString("\n"))
# Decide content layer by presence of headers
headers = body.find(list(_BLOCK_TAGS)) headers = body.find(list(_BLOCK_TAGS))
self.content_layer = ( self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
) )
# Walk the body to build the DoclingDocument
self._walk(body, doc, parent=doc.body) self._walk(body, doc, parent=doc.body)
return doc return doc
def _walk(self, element: Tag, doc: DoclingDocument, parent) -> None: def _walk(self, element: Tag, doc: DoclingDocument, parent) -> None:
"""
Recursively walk element.contents, buffering inline text across tags like <b> or <span>,
emitting text nodes only at block boundaries, and extracting images immediately.
"""
buffer: list[str] = [] buffer: list[str] = []
def flush_buffer(): def flush_buffer():
@ -126,88 +119,93 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
buffer.clear() buffer.clear()
if not text: if not text:
return return
# Split on newlines for <br>
for part in text.split("\n"): for part in text.split("\n"):
seg = part.strip() seg = part.strip()
if seg: if seg:
doc.add_text(DocItemLabel.TEXT, seg, parent=parent) doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
for node in element.contents: for node in element.contents:
# Skip scripts/styles
if isinstance(node, Tag) and node.name.lower() in ("script", "style"): if isinstance(node, Tag) and node.name.lower() in ("script", "style"):
continue continue
# Immediate image extraction
if isinstance(node, Tag) and node.name.lower() == "img": if isinstance(node, Tag) and node.name.lower() == "img":
flush_buffer() flush_buffer()
self._emit_image(node, doc, parent) self._emit_image(node, doc, parent)
continue continue
# Block-level element triggers flush + handle
if isinstance(node, Tag) and node.name.lower() in _BLOCK_TAGS: if isinstance(node, Tag) and node.name.lower() in _BLOCK_TAGS:
flush_buffer() flush_buffer()
self._handle_block(node, doc, parent) self._handle_block(node, doc, parent)
# Inline tag with nested blocks: recurse
elif isinstance(node, Tag) and node.find(list(_BLOCK_TAGS)): elif isinstance(node, Tag) and node.find(list(_BLOCK_TAGS)):
flush_buffer() flush_buffer()
self._walk(node, doc, parent) self._walk(node, doc, parent)
# Inline text
elif isinstance(node, Tag): elif isinstance(node, Tag):
buffer.append(node.get_text()) buffer.append(node.get_text())
elif isinstance(node, NavigableString): elif isinstance(node, NavigableString):
buffer.append(str(node)) buffer.append(str(node))
# Flush any remaining text
flush_buffer() flush_buffer()
def _handle_block(self, tag: Tag, doc: DoclingDocument, parent) -> None: def _handle_block(self, tag: Tag, doc: DoclingDocument, parent) -> None:
tag_name = tag.name.lower() tag_name = tag.name.lower()
if tag_name == "h1":
text = tag.get_text(strip=True) if tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
if text:
doc.add_title(text, parent=parent)
for img_tag in tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent)
elif tag_name in {"h2", "h3", "h4", "h5", "h6"}:
level = int(tag_name[1]) level = int(tag_name[1])
text = tag.get_text(strip=True) text = tag.get_text(strip=False)
if text: if text:
doc.add_heading(text, level=level, parent=parent) doc.add_heading(text.strip(), level=level, parent=parent)
for img_tag in tag.find_all("img", recursive=True): for img_tag in tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent) self._emit_image(img_tag, doc, parent)
elif tag_name == "p": elif tag_name == "p":
for part in tag.get_text().split("\n"): for part in tag.get_text().split("\n"):
seg = part.strip() seg = part.strip()
if seg: if seg:
doc.add_text(DocItemLabel.TEXT, seg, parent=parent) doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
for img_tag in tag.find_all("img", recursive=True): for img_tag in tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent) self._emit_image(img_tag, doc, parent)
elif tag_name in {"ul", "ol"}: elif tag_name in {"ul", "ol"}:
is_ordered = tag_name == "ol" is_ordered = (tag_name == "ol")
group = ( # Create the list container
list_group = (
doc.add_ordered_list(parent=parent) doc.add_ordered_list(parent=parent)
if is_ordered if is_ordered
else doc.add_unordered_list(parent=parent) else doc.add_unordered_list(parent=parent)
) )
# For each top-level <li> in this list
for li in tag.find_all("li", recursive=False): for li in tag.find_all("li", recursive=False):
li_text = li.get_text(separator=" ", strip=True) # 1) extract only the "direct" text from this <li>
parts: list[str] = []
for child in li.contents:
if isinstance(child, NavigableString):
text_part = child.strip()
if text_part:
parts.append(text_part)
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
text_part = child.get_text(separator=" ", strip=True)
if text_part:
parts.append(text_part)
li_text = " ".join(parts)
# 2) add the list item
li_item = doc.add_list_item( li_item = doc.add_list_item(
text=li_text, enumerated=is_ordered, parent=group text=li_text, enumerated=is_ordered, parent=list_group
) )
# Nested lists inside <li>
for sub in li.find_all(["ul", "ol"], recursive=False): # 3) recurse into any nested lists, attaching them to this <li> item
self._handle_block(sub, doc, parent=group) for sublist in li.find_all(["ul", "ol"], recursive=False):
self._handle_block(sublist, doc, parent=li_item)
# 4) extract any images under this <li>
for img_tag in li.find_all("img", recursive=True): for img_tag in li.find_all("img", recursive=True):
self._emit_image(img_tag, doc, li_item) self._emit_image(img_tag, doc, li_item)
elif tag_name == "table": elif tag_name == "table":
# Add table item and extract nested images
data = self._parse_table(tag, doc, parent) data = self._parse_table(tag, doc, parent)
doc.add_table(data=data, parent=parent) doc.add_table(data=data, parent=parent)
def _emit_image(self, img_tag: Tag, doc: DoclingDocument, parent) -> None: def _emit_image(self, img_tag: Tag, doc: DoclingDocument, parent) -> None:
""" if self.image_options == ImageOptions.NONE:
Helper to create a PictureItem (with optional CAPTION) for an <img> tag.
"""
if ImageOptions.NONE == self.image_options:
return return
alt = (img_tag.get("alt") or "").strip() alt = (img_tag.get("alt") or "").strip()
@ -215,46 +213,40 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
if alt: if alt:
caption_item = doc.add_text(DocItemLabel.CAPTION, alt, parent=parent) caption_item = doc.add_text(DocItemLabel.CAPTION, alt, parent=parent)
src_url = img_tag.get("src") src_url = img_tag.get("src", "")
width = img_tag.get("width", "128") width = img_tag.get("width", str(DEFAULT_IMAGE_WIDTH))
height = img_tag.get("height", "128") height = img_tag.get("height", str(DEFAULT_IMAGE_HEIGHT))
img_ref = None img_ref: Optional[ImageRef] = None
if ImageOptions.INLINE == self.image_options:
if self.image_options == ImageOptions.EMBEDDED:
try: try:
if src_url.startswith("http"): if src_url.startswith("http"):
img = Image.open(requests.get(src_url, stream=True).raw) img = Image.open(requests.get(src_url, stream=True).raw)
elif src_url.startswith("file:"):
img = Image.open(src_url)
elif src_url.startswith("data:"): elif src_url.startswith("data:"):
image_data = re.sub("^data:image/.+;base64,", "", src_url) data = re.sub(r"^data:image/.+;base64,", "", src_url)
img = Image.open(BytesIO(base64.b64decode(image_data))) img = Image.open(BytesIO(base64.b64decode(data)))
else: else:
return return
img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi")[0])) img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
except (FileNotFoundError, UnidentifiedImageError) as ve: except (FileNotFoundError, UnidentifiedImageError) as e:
_log.warning(f"Could not load image (src={src_url}): {ve}") _log.warning(f"Could not load image (src={src_url}): {e}")
return return
elif ImageOptions.REFERENCED == self.image_options:
elif self.image_options == ImageOptions.REFERENCED:
try: try:
img_url = AnyUrl(src_url)
img_ref = ImageRef( img_ref = ImageRef(
uri=img_url, uri=AnyUrl(src_url),
dpi=72, dpi=72,
mimetype="image/png", mimetype="image/png",
size=Size(width=float(width), height=float(height)), size=Size(width=float(width), height=float(height)),
) )
except ValidationError as ve: except ValidationError as e:
_log.warning(f"Could not load image (src={src_url}): {ve}") _log.warning(f"Could not load image (src={src_url}): {e}")
return return
doc.add_picture(image=img_ref, caption=caption_item, parent=parent) doc.add_picture(image=img_ref, caption=caption_item, parent=parent)
def _parse_table(self, table_tag: Tag, doc: DoclingDocument, parent) -> TableData: def _parse_table(self, table_tag: Tag, doc: DoclingDocument, parent) -> TableData:
"""
Convert an HTML table into TableData, capturing cell spans and text,
and emitting any nested images as PictureItems.
"""
# Build TableData
rows = [] rows = []
for sec in ("thead", "tbody", "tfoot"): for sec in ("thead", "tbody", "tfoot"):
section = table_tag.find(sec) section = table_tag.find(sec)
@ -262,9 +254,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
rows.extend(section.find_all("tr", recursive=False)) rows.extend(section.find_all("tr", recursive=False))
if not rows: if not rows:
rows = table_tag.find_all("tr", recursive=False) rows = table_tag.find_all("tr", recursive=False)
occupied: dict[tuple[int, int], bool] = {} occupied: dict[tuple[int, int], bool] = {}
cells: list[TableCell] = [] cells: list[TableCell] = []
max_cols = 0 max_cols = 0
for r, tr in enumerate(rows): for r, tr in enumerate(rows):
c = 0 c = 0
for cell_tag in tr.find_all(("td", "th"), recursive=False): for cell_tag in tr.find_all(("td", "th"), recursive=False):
@ -292,9 +286,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
occupied[(r + dr, c + dc)] = True occupied[(r + dr, c + dc)] = True
c += cs c += cs
max_cols = max(max_cols, c) max_cols = max(max_cols, c)
# Emit images inside this table
# emit any images in the table
for img_tag in table_tag.find_all("img", recursive=True): for img_tag in table_tag.find_all("img", recursive=True):
self._emit_image(img_tag, doc, parent) self._emit_image(img_tag, doc, parent)
return TableData(table_cells=cells, num_rows=len(rows), num_cols=max_cols) return TableData(table_cells=cells, num_rows=len(rows), num_cols=max_cols)
@ -308,14 +304,14 @@ class HTMLDocumentBackend(BaseHTMLDocumentBackend):
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.NONE) super().__init__(in_doc, path_or_stream, image_options=ImageOptions.NONE)
class HTMLDocumentBackendImagesInline(BaseHTMLDocumentBackend): class HTMLDocumentBackendImagesEmbedded(BaseHTMLDocumentBackend):
@override @override
def __init__( def __init__(
self, self,
in_doc: InputDocument, in_doc: InputDocument,
path_or_stream: Union[BytesIO, Path], path_or_stream: Union[BytesIO, Path],
): ):
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.INLINE) super().__init__(in_doc, path_or_stream, image_options=ImageOptions.EMBEDDED)
class HTMLDocumentBackendImagesReferenced(BaseHTMLDocumentBackend): class HTMLDocumentBackendImagesReferenced(BaseHTMLDocumentBackend):

File diff suppressed because one or more lines are too long