A new HTML backend that handles styled HTML (ignores the styling) as well as images.

Note: MyPy fails. This seems to be a known issue with BeautifulSoup: https://github.com/python/typeshed/pull/13604 - Fixed issues with handling nested lists. - Fixed some issues with spaces between text fragments. - Changed naming of image configuration from INLINE to EMBEDDED. Also renamed the corresponding class. - Introduced constants for the default image width/height. Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>
2025-07-26 20:14:47 +00:00 · 2025-05-24 22:25:51 +02:00 · 2025-05-24 22:25:51 +02:00 · 5d08b749af
commit 5d08b749af
parent 733360c7b2
2 changed files with 6022 additions and 162 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -7,6 +7,7 @@ from pathlib import Path
 from typing import Optional, Union
 import requests
 from PIL import Image, UnidentifiedImageError
 from bs4 import BeautifulSoup, NavigableString, Tag
 from docling_core.types.doc import (
    DocItemLabel,
@ -17,8 +18,7 @@ from docling_core.types.doc import (
    TableData,
 )
 from docling_core.types.doc.document import ContentLayer, ImageRef
-from PIL import Image, UnidentifiedImageError
+from pydantic import AnyUrl, ValidationError
 from pydantic import AnyUrl, HttpUrl, ValidationError
 from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@ -27,15 +27,17 @@ from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
 DEFAULT_IMAGE_WIDTH = 128
 DEFAULT_IMAGE_HEIGHT = 128
 # Tags that initiate distinct Docling items
 _BLOCK_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table"}
 class ImageOptions(str, Enum):
    """Image options for HTML backend."""
    NONE = "none"
-    INLINE = "inline"
+    EMBEDDED = "embedded"
    REFERENCED = "referenced"
@ -49,7 +51,6 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
    ):
        super().__init__(in_doc, path_or_stream)
        self.image_options = image_options
        self.soup: Optional[Tag] = None
        try:
            raw = (
                path_or_stream.getvalue()
@ -88,35 +89,27 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
-        _log.debug("Starting HTML conversion...")
+        title = self.soup.find("title")
-        if not self.is_valid():
+        if title:
-            raise RuntimeError("Invalid HTML document.")
+            doc.add_title(title.get_text())
-        assert self.soup is not None
+        # remove scripts/styles
        # Remove all script/style content
        for tag in self.soup.find_all(["script", "style"]):
            tag.decompose()
        body = self.soup.body or self.soup
-        # Normalize <br> tags to newline strings
+        # normalize <br>
        for br in body.find_all("br"):
            br.replace_with(NavigableString("\n"))
        # Decide content layer by presence of headers
        headers = body.find(list(_BLOCK_TAGS))
        self.content_layer = (
            ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
        )
        # Walk the body to build the DoclingDocument
        self._walk(body, doc, parent=doc.body)
        return doc
    def _walk(self, element: Tag, doc: DoclingDocument, parent) -> None:
        """
        Recursively walk element.contents, buffering inline text across tags like <b> or <span>,
        emitting text nodes only at block boundaries, and extracting images immediately.
        """
        buffer: list[str] = []
        def flush_buffer():
@ -126,88 +119,93 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
            buffer.clear()
            if not text:
                return
            # Split on newlines for <br>
            for part in text.split("\n"):
                seg = part.strip()
                if seg:
                    doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
        for node in element.contents:
            # Skip scripts/styles
            if isinstance(node, Tag) and node.name.lower() in ("script", "style"):
                continue
            # Immediate image extraction
            if isinstance(node, Tag) and node.name.lower() == "img":
                flush_buffer()
                self._emit_image(node, doc, parent)
                continue
            # Block-level element triggers flush + handle
            if isinstance(node, Tag) and node.name.lower() in _BLOCK_TAGS:
                flush_buffer()
                self._handle_block(node, doc, parent)
            # Inline tag with nested blocks: recurse
            elif isinstance(node, Tag) and node.find(list(_BLOCK_TAGS)):
                flush_buffer()
                self._walk(node, doc, parent)
            # Inline text
            elif isinstance(node, Tag):
                buffer.append(node.get_text())
            elif isinstance(node, NavigableString):
                buffer.append(str(node))
        # Flush any remaining text
        flush_buffer()
    def _handle_block(self, tag: Tag, doc: DoclingDocument, parent) -> None:
        tag_name = tag.name.lower()
-        if tag_name == "h1":
+
-            text = tag.get_text(strip=True)
+        if tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            if text:
                doc.add_title(text, parent=parent)
            for img_tag in tag.find_all("img", recursive=True):
                self._emit_image(img_tag, doc, parent)
        elif tag_name in {"h2", "h3", "h4", "h5", "h6"}:
            level = int(tag_name[1])
-            text = tag.get_text(strip=True)
+            text = tag.get_text(strip=False)
            if text:
-                doc.add_heading(text, level=level, parent=parent)
+                doc.add_heading(text.strip(), level=level, parent=parent)
            for img_tag in tag.find_all("img", recursive=True):
                self._emit_image(img_tag, doc, parent)
        elif tag_name == "p":
            for part in tag.get_text().split("\n"):
                seg = part.strip()
                if seg:
                    doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
-                for img_tag in tag.find_all("img", recursive=True):
+            for img_tag in tag.find_all("img", recursive=True):
-                    self._emit_image(img_tag, doc, parent)
+                self._emit_image(img_tag, doc, parent)
        elif tag_name in {"ul", "ol"}:
-            is_ordered = tag_name == "ol"
+            is_ordered = (tag_name == "ol")
-            group = (
+            # Create the list container
            list_group = (
                doc.add_ordered_list(parent=parent)
                if is_ordered
                else doc.add_unordered_list(parent=parent)
            )
            # For each top-level <li> in this list
            for li in tag.find_all("li", recursive=False):
-                li_text = li.get_text(separator=" ", strip=True)
+                # 1) extract only the "direct" text from this <li>
                parts: list[str] = []
                for child in li.contents:
                    if isinstance(child, NavigableString):
                        text_part = child.strip()
                        if text_part:
                            parts.append(text_part)
                    elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
                        text_part = child.get_text(separator=" ", strip=True)
                        if text_part:
                            parts.append(text_part)
                li_text = " ".join(parts)
                # 2) add the list item
                li_item = doc.add_list_item(
-                    text=li_text, enumerated=is_ordered, parent=group
+                    text=li_text, enumerated=is_ordered, parent=list_group
                )
-                # Nested lists inside <li>
+
-                for sub in li.find_all(["ul", "ol"], recursive=False):
+                # 3) recurse into any nested lists, attaching them to this <li> item
-                    self._handle_block(sub, doc, parent=group)
+                for sublist in li.find_all(["ul", "ol"], recursive=False):
                    self._handle_block(sublist, doc, parent=li_item)
                # 4) extract any images under this <li>
                for img_tag in li.find_all("img", recursive=True):
                    self._emit_image(img_tag, doc, li_item)
        elif tag_name == "table":
            # Add table item and extract nested images
            data = self._parse_table(tag, doc, parent)
            doc.add_table(data=data, parent=parent)
    def _emit_image(self, img_tag: Tag, doc: DoclingDocument, parent) -> None:
-        """
+        if self.image_options == ImageOptions.NONE:
        Helper to create a PictureItem (with optional CAPTION) for an <img> tag.
        """
        if ImageOptions.NONE == self.image_options:
            return
        alt = (img_tag.get("alt") or "").strip()
@ -215,46 +213,40 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
        if alt:
            caption_item = doc.add_text(DocItemLabel.CAPTION, alt, parent=parent)
-        src_url = img_tag.get("src")
+        src_url = img_tag.get("src", "")
-        width = img_tag.get("width", "128")
+        width = img_tag.get("width", str(DEFAULT_IMAGE_WIDTH))
-        height = img_tag.get("height", "128")
+        height = img_tag.get("height", str(DEFAULT_IMAGE_HEIGHT))
-        img_ref = None
+        img_ref: Optional[ImageRef] = None
-        if ImageOptions.INLINE == self.image_options:
+
        if self.image_options == ImageOptions.EMBEDDED:
            try:
                if src_url.startswith("http"):
                    img = Image.open(requests.get(src_url, stream=True).raw)
                elif src_url.startswith("file:"):
                    img = Image.open(src_url)
                elif src_url.startswith("data:"):
-                    image_data = re.sub("^data:image/.+;base64,", "", src_url)
+                    data = re.sub(r"^data:image/.+;base64,", "", src_url)
-                    img = Image.open(BytesIO(base64.b64decode(image_data)))
+                    img = Image.open(BytesIO(base64.b64decode(data)))
                else:
                    return
-                img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi")[0]))
+                img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
-            except (FileNotFoundError, UnidentifiedImageError) as ve:
+            except (FileNotFoundError, UnidentifiedImageError) as e:
-                _log.warning(f"Could not load image (src={src_url}): {ve}")
+                _log.warning(f"Could not load image (src={src_url}): {e}")
                return
-        elif ImageOptions.REFERENCED == self.image_options:
+
        elif self.image_options == ImageOptions.REFERENCED:
            try:
                img_url = AnyUrl(src_url)
                img_ref = ImageRef(
-                    uri=img_url,
+                    uri=AnyUrl(src_url),
                    dpi=72,
                    mimetype="image/png",
                    size=Size(width=float(width), height=float(height)),
                )
-            except ValidationError as ve:
+            except ValidationError as e:
-                _log.warning(f"Could not load image (src={src_url}): {ve}")
+                _log.warning(f"Could not load image (src={src_url}): {e}")
                return
        doc.add_picture(image=img_ref, caption=caption_item, parent=parent)
    def _parse_table(self, table_tag: Tag, doc: DoclingDocument, parent) -> TableData:
        """
        Convert an HTML table into TableData, capturing cell spans and text,
        and emitting any nested images as PictureItems.
        """
        # Build TableData
        rows = []
        for sec in ("thead", "tbody", "tfoot"):
            section = table_tag.find(sec)
@ -262,9 +254,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
                rows.extend(section.find_all("tr", recursive=False))
        if not rows:
            rows = table_tag.find_all("tr", recursive=False)
        occupied: dict[tuple[int, int], bool] = {}
        cells: list[TableCell] = []
        max_cols = 0
        for r, tr in enumerate(rows):
            c = 0
            for cell_tag in tr.find_all(("td", "th"), recursive=False):
@ -292,9 +286,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
                        occupied[(r + dr, c + dc)] = True
                c += cs
            max_cols = max(max_cols, c)
-        # Emit images inside this table
+
        # emit any images in the table
        for img_tag in table_tag.find_all("img", recursive=True):
            self._emit_image(img_tag, doc, parent)
        return TableData(table_cells=cells, num_rows=len(rows), num_cols=max_cols)
@ -308,14 +304,14 @@ class HTMLDocumentBackend(BaseHTMLDocumentBackend):
        super().__init__(in_doc, path_or_stream, image_options=ImageOptions.NONE)
-class HTMLDocumentBackendImagesInline(BaseHTMLDocumentBackend):
+class HTMLDocumentBackendImagesEmbedded(BaseHTMLDocumentBackend):
    @override
    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
    ):
-        super().__init__(in_doc, path_or_stream, image_options=ImageOptions.INLINE)
+        super().__init__(in_doc, path_or_stream, image_options=ImageOptions.EMBEDDED)
 class HTMLDocumentBackendImagesReferenced(BaseHTMLDocumentBackend):
--- a/docs/examples/backend_html.ipynb
+++ b/docs/examples/backend_html.ipynb