mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
added the contentlayer to html-backend
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
db3ceefd4a
commit
252bd83066
@ -5,6 +5,8 @@ from typing import Final, Optional, Union, cast
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
||||
from bs4.element import PreformattedString
|
||||
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
@ -103,6 +105,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
_log.debug("Trying to convert HTML...")
|
||||
|
||||
if self.is_valid():
|
||||
self.content_layer = ContentLayer.FURNITURE
|
||||
|
||||
assert self.soup is not None
|
||||
content = self.soup.body or self.soup
|
||||
# Replace <br> tags with newline characters
|
||||
@ -117,6 +121,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return doc
|
||||
|
||||
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
|
||||
# Iterate over elements in the body of the document
|
||||
text: str = ""
|
||||
for element in tag.children:
|
||||
@ -143,8 +148,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if text and tag.name in ["div"]:
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
label=DocItemLabel.TEXT,
|
||||
text=text,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
text = ""
|
||||
|
||||
@ -197,12 +203,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text = element.text.strip()
|
||||
|
||||
if hlevel == 1:
|
||||
for key in self.parents.keys():
|
||||
self.content_layer = ContentLayer.BODY
|
||||
|
||||
for key, in self.parents.keys():
|
||||
self.parents[key] = None
|
||||
|
||||
self.level = 1
|
||||
self.parents[self.level] = doc.add_text(
|
||||
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
|
||||
parent=self.parents[0], label=DocItemLabel.TITLE, text=text, content_layer=self.content_layer
|
||||
)
|
||||
else:
|
||||
if hlevel > self.level:
|
||||
@ -213,6 +221,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
name=f"header-{i}",
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self.parents[i - 1],
|
||||
content_layer=self.content_layer
|
||||
)
|
||||
self.level = hlevel
|
||||
|
||||
@ -228,6 +237,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=self.parents[hlevel - 1],
|
||||
text=text,
|
||||
level=hlevel,
|
||||
content_layer=self.content_layer
|
||||
)
|
||||
|
||||
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
@ -236,16 +246,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
text = element.text.strip()
|
||||
if text:
|
||||
doc.add_code(parent=self.parents[self.level], text=text)
|
||||
doc.add_code(parent=self.parents[self.level], text=text, content_layer=self.content_layer)
|
||||
|
||||
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles paragraph tags (p)."""
|
||||
if element.text is None:
|
||||
return
|
||||
text = element.text.strip()
|
||||
label = DocItemLabel.PARAGRAPH
|
||||
label = DocItemLabel.TEXT
|
||||
if text:
|
||||
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
||||
doc.add_text(parent=self.parents[self.level], label=label, text=text, content_layer=self.content_layer)
|
||||
|
||||
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles list tags (ul, ol) and their list items."""
|
||||
@ -253,7 +263,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if element.name == "ul":
|
||||
# create a list group
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
|
||||
parent=self.parents[self.level], name="list", label=GroupLabel.LIST, content_layer=self.content_layer
|
||||
)
|
||||
elif element.name == "ol":
|
||||
start_attr = element.get("start")
|
||||
@ -267,6 +277,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=self.parents[self.level],
|
||||
name="ordered list" + (f" start {start}" if start != 1 else ""),
|
||||
label=GroupLabel.ORDERED_LIST,
|
||||
content_layer=self.content_layer
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@ -315,6 +326,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=parent,
|
||||
content_layer=self.content_layer
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@ -336,6 +348,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=parent,
|
||||
content_layer=self.content_layer
|
||||
)
|
||||
else:
|
||||
_log.debug(f"list-item has no text: {element}")
|
||||
@ -439,7 +452,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
table_data = HTMLDocumentBackend.parse_table_data(element)
|
||||
|
||||
if table_data is not None:
|
||||
doc.add_table(data=table_data, parent=self.parents[self.level])
|
||||
doc.add_table(data=table_data, parent=self.parents[self.level], content_layer=self.content_layer)
|
||||
|
||||
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
||||
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
||||
@ -479,20 +492,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
contains_captions = element.find(["figcaption"])
|
||||
if not isinstance(contains_captions, Tag):
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer)
|
||||
else:
|
||||
texts = []
|
||||
for item in contains_captions:
|
||||
texts.append(item.text)
|
||||
|
||||
fig_caption = doc.add_text(
|
||||
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
|
||||
label=DocItemLabel.CAPTION, text=("".join(texts)).strip(), content_layer=self.content_layer
|
||||
)
|
||||
doc.add_picture(
|
||||
parent=self.parents[self.level],
|
||||
caption=fig_caption,
|
||||
content_layer=self.content_layer
|
||||
)
|
||||
|
||||
def handle_image(self, doc: DoclingDocument) -> None:
|
||||
"""Handles image tags (img)."""
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer)
|
||||
|
Loading…
Reference in New Issue
Block a user