diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index f2320693..e198752f 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -5,6 +5,8 @@ from typing import Final, Optional, Union, cast
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
+
+from docling_core.types.doc.document import ContentLayer
from docling_core.types.doc import (
DocItem,
DocItemLabel,
@@ -103,6 +105,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.debug("Trying to convert HTML...")
if self.is_valid():
+ self.content_layer = ContentLayer.FURNITURE
+
assert self.soup is not None
content = self.soup.body or self.soup
# Replace
tags with newline characters
@@ -117,6 +121,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return doc
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+
# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
@@ -143,8 +148,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if text and tag.name in ["div"]:
doc.add_text(
parent=self.parents[self.level],
- label=DocItemLabel.PARAGRAPH,
+ label=DocItemLabel.TEXT,
text=text,
+ content_layer=self.content_layer,
)
text = ""
@@ -197,12 +203,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = element.text.strip()
if hlevel == 1:
- for key in self.parents.keys():
+ self.content_layer = ContentLayer.BODY
+
+ for key, in self.parents.keys():
self.parents[key] = None
self.level = 1
self.parents[self.level] = doc.add_text(
- parent=self.parents[0], label=DocItemLabel.TITLE, text=text
+ parent=self.parents[0], label=DocItemLabel.TITLE, text=text, content_layer=self.content_layer
)
else:
if hlevel > self.level:
@@ -213,6 +221,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
+ content_layer=self.content_layer
)
self.level = hlevel
@@ -228,6 +237,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[hlevel - 1],
text=text,
level=hlevel,
+ content_layer=self.content_layer
)
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -236,16 +246,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return
text = element.text.strip()
if text:
- doc.add_code(parent=self.parents[self.level], text=text)
+ doc.add_code(parent=self.parents[self.level], text=text, content_layer=self.content_layer)
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
if element.text is None:
return
text = element.text.strip()
- label = DocItemLabel.PARAGRAPH
+ label = DocItemLabel.TEXT
if text:
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
+ doc.add_text(parent=self.parents[self.level], label=label, text=text, content_layer=self.content_layer)
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
@@ -253,7 +263,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
- parent=self.parents[self.level], name="list", label=GroupLabel.LIST
+ parent=self.parents[self.level], name="list", label=GroupLabel.LIST, content_layer=self.content_layer
)
elif element.name == "ol":
start_attr = element.get("start")
@@ -267,6 +277,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level],
name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST,
+ content_layer=self.content_layer
)
self.level += 1
@@ -315,6 +326,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated,
marker=marker,
parent=parent,
+ content_layer=self.content_layer
)
self.level += 1
@@ -336,6 +348,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated,
marker=marker,
parent=parent,
+ content_layer=self.content_layer
)
else:
_log.debug(f"list-item has no text: {element}")
@@ -439,7 +452,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
table_data = HTMLDocumentBackend.parse_table_data(element)
if table_data is not None:
- doc.add_table(data=table_data, parent=self.parents[self.level])
+ doc.add_table(data=table_data, parent=self.parents[self.level], content_layer=self.content_layer)
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from