refactor: add the contentlayer to html-backend (#1040)

* added the contentlayer to html-backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the handle_image function

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformatted code of html backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* test(html): add more info if a test case fails

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* refactor(html): put parsed item in body if doc has no header

In case an HTML does not have any header tag, all parsed items are placed in
DoclingDocument's body content layer.
HTML paragraphs ('p' tags) are parsed as text items with paragraph label.
Update test ground truth accoring to the changes above.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: set TextItem label to 'text' instead of 'paragraph'

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Peter W. J. Staar
2025-03-02 10:37:53 -05:00
committed by GitHub
parent db3ceefd4a
commit e25d557c06
13 changed files with 623 additions and 709 deletions

View File

@@ -15,6 +15,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -66,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {self.document_hash}."
"Could not initialize HTML backend for file with "
f"hash {self.document_hash}."
) from e
@override
@@ -109,14 +111,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# TODO: remove style to avoid losing text from tags like i, b, span, ...
for br in content("br"):
br.replace_with(NavigableString("\n"))
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
f"Cannot convert doc with {self.document_hash} because the backend "
"failed to init."
)
return doc
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
@@ -143,8 +152,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if text and tag.name in ["div"]:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.PARAGRAPH,
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)
text = ""
@@ -166,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif tag.name == "figure":
self.handle_figure(tag, doc)
elif tag.name == "img":
self.handle_image(doc)
self.handle_image(tag, doc)
else:
self.walk(tag, doc)
@@ -197,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = element.text.strip()
if hlevel == 1:
self.content_layer = ContentLayer.BODY
for key in self.parents.keys():
self.parents[key] = None
self.level = 1
self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
parent=self.parents[0],
label=DocItemLabel.TITLE,
text=text,
content_layer=self.content_layer,
)
else:
if hlevel > self.level:
@@ -213,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
content_layer=self.content_layer,
)
self.level = hlevel
@@ -228,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[hlevel - 1],
text=text,
level=hlevel,
content_layer=self.content_layer,
)
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -236,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return
text = element.text.strip()
if text:
doc.add_code(parent=self.parents[self.level], text=text)
doc.add_code(
parent=self.parents[self.level],
text=text,
content_layer=self.content_layer,
)
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
if element.text is None:
return
text = element.text.strip()
label = DocItemLabel.PARAGRAPH
if text:
doc.add_text(parent=self.parents[self.level], label=label, text=text)
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
@@ -253,7 +278,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
parent=self.parents[self.level],
name="list",
label=GroupLabel.LIST,
content_layer=self.content_layer,
)
elif element.name == "ol":
start_attr = element.get("start")
@@ -267,6 +295,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level],
name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST,
content_layer=self.content_layer,
)
self.level += 1
@@ -315,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated,
marker=marker,
parent=parent,
content_layer=self.content_layer,
)
self.level += 1
@@ -336,6 +366,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated,
marker=marker,
parent=parent,
content_layer=self.content_layer,
)
else:
_log.debug(f"list-item has no text: {element}")
@@ -439,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
table_data = HTMLDocumentBackend.parse_table_data(element)
if table_data is not None:
doc.add_table(data=table_data, parent=self.parents[self.level])
doc.add_table(
data=table_data,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation."""
@@ -479,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
contains_captions = element.find(["figcaption"])
if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None)
doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)
else:
texts = []
for item in contains_captions:
texts.append(item.text)
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
label=DocItemLabel.CAPTION,
text=("".join(texts)).strip(),
content_layer=self.content_layer,
)
doc.add_picture(
parent=self.parents[self.level],
caption=fig_caption,
content_layer=self.content_layer,
)
def handle_image(self, doc: DoclingDocument) -> None:
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None)
_log.debug(f"ignoring <img> tags at the moment: {element}")
doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)