refactor: add the contentlayer to html-backend (#1040)

* added the contentlayer to html-backend Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updated the handle_image function Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted code of html backend Signed-off-by: Peter Staar <taa@zurich.ibm.com> * test(html): add more info if a test case fails Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * refactor(html): put parsed item in body if doc has no header In case an HTML does not have any header tag, all parsed items are placed in DoclingDocument's body content layer. HTML paragraphs ('p' tags) are parsed as text items with paragraph label. Update test ground truth accoring to the changes above. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * chore: set TextItem label to 'text' instead of 'paragraph' Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-12-08 20:58:11 +00:00 · 2025-03-02 10:37:53 -05:00
parent db3ceefd4a
commit e25d557c06
13 changed files with 623 additions and 709 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -15,6 +15,7 @@ from docling_core.types.doc import (
    TableCell,
    TableData,
 )
+from docling_core.types.doc.document import ContentLayer
 from typing_extensions import override

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -66,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            raise RuntimeError(
-                f"Could not initialize HTML backend for file with hash {self.document_hash}."
+                "Could not initialize HTML backend for file with "
+                f"hash {self.document_hash}."
            ) from e

    @override
@@ -109,14 +111,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            # TODO: remove style to avoid losing text from tags like i, b, span, ...
            for br in content("br"):
                br.replace_with(NavigableString("\n"))
+
+            headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+            self.content_layer = (
+                ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
+            )
            self.walk(content, doc)
        else:
            raise RuntimeError(
-                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+                f"Cannot convert doc with {self.document_hash} because the backend "
+                "failed to init."
            )
        return doc

    def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+
        # Iterate over elements in the body of the document
        text: str = ""
        for element in tag.children:
@@ -143,8 +152,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    if text and tag.name in ["div"]:
                        doc.add_text(
                            parent=self.parents[self.level],
-                            label=DocItemLabel.PARAGRAPH,
+                            label=DocItemLabel.TEXT,
                            text=text,
+                            content_layer=self.content_layer,
                        )
                    text = ""

@@ -166,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        elif tag.name == "figure":
            self.handle_figure(tag, doc)
        elif tag.name == "img":
-            self.handle_image(doc)
+            self.handle_image(tag, doc)
        else:
            self.walk(tag, doc)

@@ -197,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        text = element.text.strip()

        if hlevel == 1:
+            self.content_layer = ContentLayer.BODY
+
            for key in self.parents.keys():
                self.parents[key] = None

            self.level = 1
            self.parents[self.level] = doc.add_text(
-                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
+                parent=self.parents[0],
+                label=DocItemLabel.TITLE,
+                text=text,
+                content_layer=self.content_layer,
            )
        else:
            if hlevel > self.level:
@@ -213,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                        name=f"header-{i}",
                        label=GroupLabel.SECTION,
                        parent=self.parents[i - 1],
+                        content_layer=self.content_layer,
                    )
                self.level = hlevel

@@ -228,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                parent=self.parents[hlevel - 1],
                text=text,
                level=hlevel,
+                content_layer=self.content_layer,
            )

    def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -236,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            return
        text = element.text.strip()
        if text:
-            doc.add_code(parent=self.parents[self.level], text=text)
+            doc.add_code(
+                parent=self.parents[self.level],
+                text=text,
+                content_layer=self.content_layer,
+            )

    def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles paragraph tags (p)."""
        if element.text is None:
            return
        text = element.text.strip()
-        label = DocItemLabel.PARAGRAPH
        if text:
-            doc.add_text(parent=self.parents[self.level], label=label, text=text)
+            doc.add_text(
+                parent=self.parents[self.level],
+                label=DocItemLabel.TEXT,
+                text=text,
+                content_layer=self.content_layer,
+            )

    def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles list tags (ul, ol) and their list items."""
@@ -253,7 +278,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        if element.name == "ul":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
-                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
+                parent=self.parents[self.level],
+                name="list",
+                label=GroupLabel.LIST,
+                content_layer=self.content_layer,
            )
        elif element.name == "ol":
            start_attr = element.get("start")
@@ -267,6 +295,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                parent=self.parents[self.level],
                name="ordered list" + (f" start {start}" if start != 1 else ""),
                label=GroupLabel.ORDERED_LIST,
+                content_layer=self.content_layer,
            )
        self.level += 1

@@ -315,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    enumerated=enumerated,
                    marker=marker,
                    parent=parent,
+                    content_layer=self.content_layer,
                )
                self.level += 1

@@ -336,6 +366,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                enumerated=enumerated,
                marker=marker,
                parent=parent,
+                content_layer=self.content_layer,
            )
        else:
            _log.debug(f"list-item has no text: {element}")
@@ -439,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        table_data = HTMLDocumentBackend.parse_table_data(element)

        if table_data is not None:
-            doc.add_table(data=table_data, parent=self.parents[self.level])
+            doc.add_table(
+                data=table_data,
+                parent=self.parents[self.level],
+                content_layer=self.content_layer,
+            )

    def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
        """Recursively extract text from <ul> or <ol> with proper indentation."""
@@ -479,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

        contains_captions = element.find(["figcaption"])
        if not isinstance(contains_captions, Tag):
-            doc.add_picture(parent=self.parents[self.level], caption=None)
+            doc.add_picture(
+                parent=self.parents[self.level],
+                caption=None,
+                content_layer=self.content_layer,
+            )
        else:
            texts = []
            for item in contains_captions:
                texts.append(item.text)

            fig_caption = doc.add_text(
-                label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
+                label=DocItemLabel.CAPTION,
+                text=("".join(texts)).strip(),
+                content_layer=self.content_layer,
            )
            doc.add_picture(
                parent=self.parents[self.level],
                caption=fig_caption,
+                content_layer=self.content_layer,
            )

-    def handle_image(self, doc: DoclingDocument) -> None:
+    def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles image tags (img)."""
-        doc.add_picture(parent=self.parents[self.level], caption=None)
+        _log.debug(f"ignoring <img> tags at the moment: {element}")
+
+        doc.add_picture(
+            parent=self.parents[self.level],
+            caption=None,
+            content_layer=self.content_layer,
+        )