Reformatted code of the HTML backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2025-02-24 09:29:06 +01:00 committed by Cesar Berrospi Ramis
parent e5e00674e1
commit 0cba30e254

View File

@ -5,8 +5,6 @@ from typing import Final, Optional, Union, cast
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString from bs4.element import PreformattedString
from docling_core.types.doc.document import ContentLayer
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItem, DocItem,
DocItemLabel, DocItemLabel,
@ -17,6 +15,7 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
from docling_core.types.doc.document import ContentLayer
from typing_extensions import override from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
@ -106,7 +105,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if self.is_valid(): if self.is_valid():
self.content_layer = ContentLayer.FURNITURE self.content_layer = ContentLayer.FURNITURE
assert self.soup is not None assert self.soup is not None
content = self.soup.body or self.soup content = self.soup.body or self.soup
# Replace <br> tags with newline characters # Replace <br> tags with newline characters
@ -204,13 +203,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if hlevel == 1: if hlevel == 1:
self.content_layer = ContentLayer.BODY self.content_layer = ContentLayer.BODY
for key, in self.parents.keys(): for key in self.parents.keys():
self.parents[key] = None self.parents[key] = None
self.level = 1 self.level = 1
self.parents[self.level] = doc.add_text( self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text, content_layer=self.content_layer parent=self.parents[0],
label=DocItemLabel.TITLE,
text=text,
content_layer=self.content_layer,
) )
else: else:
if hlevel > self.level: if hlevel > self.level:
@ -221,7 +223,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
name=f"header-{i}", name=f"header-{i}",
label=GroupLabel.SECTION, label=GroupLabel.SECTION,
parent=self.parents[i - 1], parent=self.parents[i - 1],
content_layer=self.content_layer content_layer=self.content_layer,
) )
self.level = hlevel self.level = hlevel
@ -237,7 +239,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[hlevel - 1], parent=self.parents[hlevel - 1],
text=text, text=text,
level=hlevel, level=hlevel,
content_layer=self.content_layer content_layer=self.content_layer,
) )
def handle_code(self, element: Tag, doc: DoclingDocument) -> None: def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@ -246,7 +248,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return return
text = element.text.strip() text = element.text.strip()
if text: if text:
doc.add_code(parent=self.parents[self.level], text=text, content_layer=self.content_layer) doc.add_code(
parent=self.parents[self.level],
text=text,
content_layer=self.content_layer,
)
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p).""" """Handles paragraph tags (p)."""
@ -255,7 +261,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = element.text.strip() text = element.text.strip()
label = DocItemLabel.TEXT label = DocItemLabel.TEXT
if text: if text:
doc.add_text(parent=self.parents[self.level], label=label, text=text, content_layer=self.content_layer) doc.add_text(
parent=self.parents[self.level],
label=label,
text=text,
content_layer=self.content_layer,
)
def handle_list(self, element: Tag, doc: DoclingDocument) -> None: def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items.""" """Handles list tags (ul, ol) and their list items."""
@ -263,7 +274,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if element.name == "ul": if element.name == "ul":
# create a list group # create a list group
self.parents[self.level + 1] = doc.add_group( self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST, content_layer=self.content_layer parent=self.parents[self.level],
name="list",
label=GroupLabel.LIST,
content_layer=self.content_layer,
) )
elif element.name == "ol": elif element.name == "ol":
start_attr = element.get("start") start_attr = element.get("start")
@ -277,7 +291,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level], parent=self.parents[self.level],
name="ordered list" + (f" start {start}" if start != 1 else ""), name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST, label=GroupLabel.ORDERED_LIST,
content_layer=self.content_layer content_layer=self.content_layer,
) )
self.level += 1 self.level += 1
@ -326,7 +340,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated, enumerated=enumerated,
marker=marker, marker=marker,
parent=parent, parent=parent,
content_layer=self.content_layer content_layer=self.content_layer,
) )
self.level += 1 self.level += 1
@ -348,7 +362,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated, enumerated=enumerated,
marker=marker, marker=marker,
parent=parent, parent=parent,
content_layer=self.content_layer content_layer=self.content_layer,
) )
else: else:
_log.debug(f"list-item has no text: {element}") _log.debug(f"list-item has no text: {element}")
@ -452,7 +466,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
table_data = HTMLDocumentBackend.parse_table_data(element) table_data = HTMLDocumentBackend.parse_table_data(element)
if table_data is not None: if table_data is not None:
doc.add_table(data=table_data, parent=self.parents[self.level], content_layer=self.content_layer) doc.add_table(
data=table_data,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]: def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation.""" """Recursively extract text from <ul> or <ol> with proper indentation."""
@ -492,23 +510,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
contains_captions = element.find(["figcaption"]) contains_captions = element.find(["figcaption"])
if not isinstance(contains_captions, Tag): if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer) doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)
else: else:
texts = [] texts = []
for item in contains_captions: for item in contains_captions:
texts.append(item.text) texts.append(item.text)
fig_caption = doc.add_text( fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip(), content_layer=self.content_layer label=DocItemLabel.CAPTION,
text=("".join(texts)).strip(),
content_layer=self.content_layer,
) )
doc.add_picture( doc.add_picture(
parent=self.parents[self.level], parent=self.parents[self.level],
caption=fig_caption, caption=fig_caption,
content_layer=self.content_layer content_layer=self.content_layer,
) )
def handle_image(self, element: Tag, doc: DoclingDocument) -> None: def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img).""" """Handles image tags (img)."""
_log.warning(f"ignoring <img> tags at the moment: {element}") _log.warning(f"ignoring <img> tags at the moment: {element}")
doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer) doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)