mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
refactor: add the contentlayer to html-backend (#1040)
* added the contentlayer to html-backend Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updated the handle_image function Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted code of html backend Signed-off-by: Peter Staar <taa@zurich.ibm.com> * test(html): add more info if a test case fails Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * refactor(html): put parsed item in body if doc has no header In case an HTML does not have any header tag, all parsed items are placed in DoclingDocument's body content layer. HTML paragraphs ('p' tags) are parsed as text items with paragraph label. Update test ground truth accoring to the changes above. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * chore: set TextItem label to 'text' instead of 'paragraph' Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
db3ceefd4a
commit
e25d557c06
@@ -15,6 +15,7 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
@@ -66,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"Could not initialize HTML backend for file with hash {self.document_hash}."
|
||||
"Could not initialize HTML backend for file with "
|
||||
f"hash {self.document_hash}."
|
||||
) from e
|
||||
|
||||
@override
|
||||
@@ -109,14 +111,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
# TODO: remove style to avoid losing text from tags like i, b, span, ...
|
||||
for br in content("br"):
|
||||
br.replace_with(NavigableString("\n"))
|
||||
|
||||
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
|
||||
self.content_layer = (
|
||||
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
||||
)
|
||||
self.walk(content, doc)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||
f"Cannot convert doc with {self.document_hash} because the backend "
|
||||
"failed to init."
|
||||
)
|
||||
return doc
|
||||
|
||||
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
|
||||
# Iterate over elements in the body of the document
|
||||
text: str = ""
|
||||
for element in tag.children:
|
||||
@@ -143,8 +152,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if text and tag.name in ["div"]:
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
label=DocItemLabel.TEXT,
|
||||
text=text,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
text = ""
|
||||
|
||||
@@ -166,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif tag.name == "figure":
|
||||
self.handle_figure(tag, doc)
|
||||
elif tag.name == "img":
|
||||
self.handle_image(doc)
|
||||
self.handle_image(tag, doc)
|
||||
else:
|
||||
self.walk(tag, doc)
|
||||
|
||||
@@ -197,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text = element.text.strip()
|
||||
|
||||
if hlevel == 1:
|
||||
self.content_layer = ContentLayer.BODY
|
||||
|
||||
for key in self.parents.keys():
|
||||
self.parents[key] = None
|
||||
|
||||
self.level = 1
|
||||
self.parents[self.level] = doc.add_text(
|
||||
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
|
||||
parent=self.parents[0],
|
||||
label=DocItemLabel.TITLE,
|
||||
text=text,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
else:
|
||||
if hlevel > self.level:
|
||||
@@ -213,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
name=f"header-{i}",
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self.parents[i - 1],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.level = hlevel
|
||||
|
||||
@@ -228,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=self.parents[hlevel - 1],
|
||||
text=text,
|
||||
level=hlevel,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
@@ -236,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
text = element.text.strip()
|
||||
if text:
|
||||
doc.add_code(parent=self.parents[self.level], text=text)
|
||||
doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=text,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles paragraph tags (p)."""
|
||||
if element.text is None:
|
||||
return
|
||||
text = element.text.strip()
|
||||
label = DocItemLabel.PARAGRAPH
|
||||
if text:
|
||||
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
text=text,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles list tags (ul, ol) and their list items."""
|
||||
@@ -253,7 +278,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if element.name == "ul":
|
||||
# create a list group
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
|
||||
parent=self.parents[self.level],
|
||||
name="list",
|
||||
label=GroupLabel.LIST,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
elif element.name == "ol":
|
||||
start_attr = element.get("start")
|
||||
@@ -267,6 +295,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=self.parents[self.level],
|
||||
name="ordered list" + (f" start {start}" if start != 1 else ""),
|
||||
label=GroupLabel.ORDERED_LIST,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@@ -315,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=parent,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@@ -336,6 +366,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=parent,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
else:
|
||||
_log.debug(f"list-item has no text: {element}")
|
||||
@@ -439,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
table_data = HTMLDocumentBackend.parse_table_data(element)
|
||||
|
||||
if table_data is not None:
|
||||
doc.add_table(data=table_data, parent=self.parents[self.level])
|
||||
doc.add_table(
|
||||
data=table_data,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
||||
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
||||
@@ -479,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
contains_captions = element.find(["figcaption"])
|
||||
if not isinstance(contains_captions, Tag):
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
doc.add_picture(
|
||||
parent=self.parents[self.level],
|
||||
caption=None,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
else:
|
||||
texts = []
|
||||
for item in contains_captions:
|
||||
texts.append(item.text)
|
||||
|
||||
fig_caption = doc.add_text(
|
||||
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
|
||||
label=DocItemLabel.CAPTION,
|
||||
text=("".join(texts)).strip(),
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
doc.add_picture(
|
||||
parent=self.parents[self.level],
|
||||
caption=fig_caption,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
def handle_image(self, doc: DoclingDocument) -> None:
|
||||
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles image tags (img)."""
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
_log.debug(f"ignoring <img> tags at the moment: {element}")
|
||||
|
||||
doc.add_picture(
|
||||
parent=self.parents[self.level],
|
||||
caption=None,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user