Added the content layer to the HTML backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2025-02-23 07:01:18 +01:00 committed by Cesar Berrospi Ramis
parent db3ceefd4a
commit 252bd83066

View File

@ -5,6 +5,8 @@ from typing import Final, Optional, Union, cast
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString from bs4.element import PreformattedString
from docling_core.types.doc.document import ContentLayer
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItem, DocItem,
DocItemLabel, DocItemLabel,
@ -103,6 +105,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.debug("Trying to convert HTML...") _log.debug("Trying to convert HTML...")
if self.is_valid(): if self.is_valid():
self.content_layer = ContentLayer.FURNITURE
assert self.soup is not None assert self.soup is not None
content = self.soup.body or self.soup content = self.soup.body or self.soup
# Replace <br> tags with newline characters # Replace <br> tags with newline characters
@ -117,6 +121,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return doc return doc
def walk(self, tag: Tag, doc: DoclingDocument) -> None: def walk(self, tag: Tag, doc: DoclingDocument) -> None:
# Iterate over elements in the body of the document # Iterate over elements in the body of the document
text: str = "" text: str = ""
for element in tag.children: for element in tag.children:
@ -143,8 +148,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if text and tag.name in ["div"]: if text and tag.name in ["div"]:
doc.add_text( doc.add_text(
parent=self.parents[self.level], parent=self.parents[self.level],
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.TEXT,
text=text, text=text,
content_layer=self.content_layer,
) )
text = "" text = ""
@ -197,12 +203,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = element.text.strip() text = element.text.strip()
if hlevel == 1: if hlevel == 1:
for key in self.parents.keys(): self.content_layer = ContentLayer.BODY
for key in self.parents.keys():
self.parents[key] = None self.parents[key] = None
self.level = 1 self.level = 1
self.parents[self.level] = doc.add_text( self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text parent=self.parents[0], label=DocItemLabel.TITLE, text=text, content_layer=self.content_layer
) )
else: else:
if hlevel > self.level: if hlevel > self.level:
@ -213,6 +221,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
name=f"header-{i}", name=f"header-{i}",
label=GroupLabel.SECTION, label=GroupLabel.SECTION,
parent=self.parents[i - 1], parent=self.parents[i - 1],
content_layer=self.content_layer
) )
self.level = hlevel self.level = hlevel
@ -228,6 +237,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[hlevel - 1], parent=self.parents[hlevel - 1],
text=text, text=text,
level=hlevel, level=hlevel,
content_layer=self.content_layer
) )
def handle_code(self, element: Tag, doc: DoclingDocument) -> None: def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@ -236,16 +246,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return return
text = element.text.strip() text = element.text.strip()
if text: if text:
doc.add_code(parent=self.parents[self.level], text=text) doc.add_code(parent=self.parents[self.level], text=text, content_layer=self.content_layer)
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p).""" """Handles paragraph tags (p)."""
if element.text is None: if element.text is None:
return return
text = element.text.strip() text = element.text.strip()
label = DocItemLabel.PARAGRAPH label = DocItemLabel.TEXT
if text: if text:
doc.add_text(parent=self.parents[self.level], label=label, text=text) doc.add_text(parent=self.parents[self.level], label=label, text=text, content_layer=self.content_layer)
def handle_list(self, element: Tag, doc: DoclingDocument) -> None: def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items.""" """Handles list tags (ul, ol) and their list items."""
@ -253,7 +263,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if element.name == "ul": if element.name == "ul":
# create a list group # create a list group
self.parents[self.level + 1] = doc.add_group( self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST parent=self.parents[self.level], name="list", label=GroupLabel.LIST, content_layer=self.content_layer
) )
elif element.name == "ol": elif element.name == "ol":
start_attr = element.get("start") start_attr = element.get("start")
@ -267,6 +277,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level], parent=self.parents[self.level],
name="ordered list" + (f" start {start}" if start != 1 else ""), name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST, label=GroupLabel.ORDERED_LIST,
content_layer=self.content_layer
) )
self.level += 1 self.level += 1
@ -315,6 +326,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated, enumerated=enumerated,
marker=marker, marker=marker,
parent=parent, parent=parent,
content_layer=self.content_layer
) )
self.level += 1 self.level += 1
@ -336,6 +348,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated, enumerated=enumerated,
marker=marker, marker=marker,
parent=parent, parent=parent,
content_layer=self.content_layer
) )
else: else:
_log.debug(f"list-item has no text: {element}") _log.debug(f"list-item has no text: {element}")
@ -439,7 +452,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
table_data = HTMLDocumentBackend.parse_table_data(element) table_data = HTMLDocumentBackend.parse_table_data(element)
if table_data is not None: if table_data is not None:
doc.add_table(data=table_data, parent=self.parents[self.level]) doc.add_table(data=table_data, parent=self.parents[self.level], content_layer=self.content_layer)
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]: def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation.""" """Recursively extract text from <ul> or <ol> with proper indentation."""
@ -479,20 +492,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
contains_captions = element.find(["figcaption"]) contains_captions = element.find(["figcaption"])
if not isinstance(contains_captions, Tag): if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None) doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer)
else: else:
texts = [] texts = []
for item in contains_captions: for item in contains_captions:
texts.append(item.text) texts.append(item.text)
fig_caption = doc.add_text( fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip() label=DocItemLabel.CAPTION, text=("".join(texts)).strip(), content_layer=self.content_layer
) )
doc.add_picture( doc.add_picture(
parent=self.parents[self.level], parent=self.parents[self.level],
caption=fig_caption, caption=fig_caption,
content_layer=self.content_layer
) )
def handle_image(self, doc: DoclingDocument) -> None: def handle_image(self, doc: DoclingDocument) -> None:
"""Handles image tags (img).""" """Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None) doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer)