mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
reformatted code of html backend
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
e5e00674e1
commit
0cba30e254
@ -5,8 +5,6 @@ from typing import Final, Optional, Union, cast
|
|||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
||||||
from bs4.element import PreformattedString
|
from bs4.element import PreformattedString
|
||||||
|
|
||||||
from docling_core.types.doc.document import ContentLayer
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItem,
|
DocItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -17,6 +15,7 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
|
from docling_core.types.doc.document import ContentLayer
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
@ -106,7 +105,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
self.content_layer = ContentLayer.FURNITURE
|
self.content_layer = ContentLayer.FURNITURE
|
||||||
|
|
||||||
assert self.soup is not None
|
assert self.soup is not None
|
||||||
content = self.soup.body or self.soup
|
content = self.soup.body or self.soup
|
||||||
# Replace <br> tags with newline characters
|
# Replace <br> tags with newline characters
|
||||||
@ -204,13 +203,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
if hlevel == 1:
|
if hlevel == 1:
|
||||||
self.content_layer = ContentLayer.BODY
|
self.content_layer = ContentLayer.BODY
|
||||||
|
|
||||||
for key, in self.parents.keys():
|
for key in self.parents.keys():
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
|
|
||||||
self.level = 1
|
self.level = 1
|
||||||
self.parents[self.level] = doc.add_text(
|
self.parents[self.level] = doc.add_text(
|
||||||
parent=self.parents[0], label=DocItemLabel.TITLE, text=text, content_layer=self.content_layer
|
parent=self.parents[0],
|
||||||
|
label=DocItemLabel.TITLE,
|
||||||
|
text=text,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if hlevel > self.level:
|
if hlevel > self.level:
|
||||||
@ -221,7 +223,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
name=f"header-{i}",
|
name=f"header-{i}",
|
||||||
label=GroupLabel.SECTION,
|
label=GroupLabel.SECTION,
|
||||||
parent=self.parents[i - 1],
|
parent=self.parents[i - 1],
|
||||||
content_layer=self.content_layer
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
self.level = hlevel
|
self.level = hlevel
|
||||||
|
|
||||||
@ -237,7 +239,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent=self.parents[hlevel - 1],
|
parent=self.parents[hlevel - 1],
|
||||||
text=text,
|
text=text,
|
||||||
level=hlevel,
|
level=hlevel,
|
||||||
content_layer=self.content_layer
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
||||||
@ -246,7 +248,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return
|
return
|
||||||
text = element.text.strip()
|
text = element.text.strip()
|
||||||
if text:
|
if text:
|
||||||
doc.add_code(parent=self.parents[self.level], text=text, content_layer=self.content_layer)
|
doc.add_code(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
text=text,
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
|
|
||||||
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
||||||
"""Handles paragraph tags (p)."""
|
"""Handles paragraph tags (p)."""
|
||||||
@ -255,7 +261,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text = element.text.strip()
|
text = element.text.strip()
|
||||||
label = DocItemLabel.TEXT
|
label = DocItemLabel.TEXT
|
||||||
if text:
|
if text:
|
||||||
doc.add_text(parent=self.parents[self.level], label=label, text=text, content_layer=self.content_layer)
|
doc.add_text(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
label=label,
|
||||||
|
text=text,
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
|
|
||||||
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
||||||
"""Handles list tags (ul, ol) and their list items."""
|
"""Handles list tags (ul, ol) and their list items."""
|
||||||
@ -263,7 +274,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if element.name == "ul":
|
if element.name == "ul":
|
||||||
# create a list group
|
# create a list group
|
||||||
self.parents[self.level + 1] = doc.add_group(
|
self.parents[self.level + 1] = doc.add_group(
|
||||||
parent=self.parents[self.level], name="list", label=GroupLabel.LIST, content_layer=self.content_layer
|
parent=self.parents[self.level],
|
||||||
|
name="list",
|
||||||
|
label=GroupLabel.LIST,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elif element.name == "ol":
|
elif element.name == "ol":
|
||||||
start_attr = element.get("start")
|
start_attr = element.get("start")
|
||||||
@ -277,7 +291,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent=self.parents[self.level],
|
parent=self.parents[self.level],
|
||||||
name="ordered list" + (f" start {start}" if start != 1 else ""),
|
name="ordered list" + (f" start {start}" if start != 1 else ""),
|
||||||
label=GroupLabel.ORDERED_LIST,
|
label=GroupLabel.ORDERED_LIST,
|
||||||
content_layer=self.content_layer
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
|
|
||||||
@ -326,7 +340,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enumerated=enumerated,
|
enumerated=enumerated,
|
||||||
marker=marker,
|
marker=marker,
|
||||||
parent=parent,
|
parent=parent,
|
||||||
content_layer=self.content_layer
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
|
|
||||||
@ -348,7 +362,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enumerated=enumerated,
|
enumerated=enumerated,
|
||||||
marker=marker,
|
marker=marker,
|
||||||
parent=parent,
|
parent=parent,
|
||||||
content_layer=self.content_layer
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
_log.debug(f"list-item has no text: {element}")
|
_log.debug(f"list-item has no text: {element}")
|
||||||
@ -452,7 +466,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
table_data = HTMLDocumentBackend.parse_table_data(element)
|
table_data = HTMLDocumentBackend.parse_table_data(element)
|
||||||
|
|
||||||
if table_data is not None:
|
if table_data is not None:
|
||||||
doc.add_table(data=table_data, parent=self.parents[self.level], content_layer=self.content_layer)
|
doc.add_table(
|
||||||
|
data=table_data,
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
|
|
||||||
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
||||||
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
||||||
@ -492,23 +510,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
contains_captions = element.find(["figcaption"])
|
contains_captions = element.find(["figcaption"])
|
||||||
if not isinstance(contains_captions, Tag):
|
if not isinstance(contains_captions, Tag):
|
||||||
doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer)
|
doc.add_picture(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
caption=None,
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
texts = []
|
texts = []
|
||||||
for item in contains_captions:
|
for item in contains_captions:
|
||||||
texts.append(item.text)
|
texts.append(item.text)
|
||||||
|
|
||||||
fig_caption = doc.add_text(
|
fig_caption = doc.add_text(
|
||||||
label=DocItemLabel.CAPTION, text=("".join(texts)).strip(), content_layer=self.content_layer
|
label=DocItemLabel.CAPTION,
|
||||||
|
text=("".join(texts)).strip(),
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=self.parents[self.level],
|
parent=self.parents[self.level],
|
||||||
caption=fig_caption,
|
caption=fig_caption,
|
||||||
content_layer=self.content_layer
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
|
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
|
||||||
"""Handles image tags (img)."""
|
"""Handles image tags (img)."""
|
||||||
_log.warning(f"ignoring <img> tags at the moment: {element}")
|
_log.warning(f"ignoring <img> tags at the moment: {element}")
|
||||||
|
|
||||||
doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer)
|
doc.add_picture(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
caption=None,
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user