fix(HTML): parse footer tag as a group in furniture content layer (#2106)

* fix(HTML): parse footer tag as a section in furniture

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(HTML): add test for body vs furniture in HTML parser.

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-10 13:48:13 +00:00 · 2025-08-20 08:42:25 +02:00
parent 8820b5558b
commit c5f2e2fdd6
5 changed files with 131 additions and 131 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -38,6 +38,7 @@ _BLOCK_TAGS: Final = {
    "address",
    "details",
    "figure",
+    "footer",
    "h1",
    "h2",
    "h3",
@@ -639,10 +640,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                        hyperlink=annotated_text.hyperlink,
                    )

-        elif tag_name == "details":
-            # handle details and its content.
+        elif tag_name in {"details", "footer"}:
+            if tag_name == "footer":
+                current_layer = self.content_layer
+                self.content_layer = ContentLayer.FURNITURE
            self.parents[self.level + 1] = doc.add_group(
-                name="details",
+                name=tag_name,
                label=GroupLabel.SECTION,
                parent=self.parents[self.level],
                content_layer=self.content_layer,
@@ -651,6 +654,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            self._walk(tag, doc)
            self.parents[self.level + 1] = None
            self.level -= 1
+            if tag_name == "footer":
+                self.content_layer = current_layer

    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
        figure = img_tag.find_parent("figure")
@@ -686,7 +691,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            text_clean = HTMLDocumentBackend._clean_unicode(
                caption_anno_text.text.strip()
            )
-            print(caption_anno_text)
            caption_item = doc.add_text(
                label=DocItemLabel.CAPTION,
                text=text_clean,