mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
work-in-progress: dealing with content in attributes of HTML elements
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
473ad9a032
commit
5fc4d5bd3d
@ -4,6 +4,8 @@ from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
@ -25,7 +27,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self,
|
||||
in_doc: "InputDocument",
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
skip_furniture: bool = False,
|
||||
skip_furniture: bool = True,
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
_log.debug("About to init HTML backend...")
|
||||
@ -45,10 +47,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
print("BytesIO")
|
||||
self.soup = BeautifulSoup(text_stream, "html.parser")
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
print("file")
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
|
||||
with open("./scratch/file.html", "w") as fw:
|
||||
fw.write(html_content)
|
||||
|
||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
@ -101,26 +109,37 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def walk(self, element, doc):
|
||||
try:
|
||||
if isinstance(element, Tag) and any(element.children):
|
||||
# Iterate over elements in the body of the document
|
||||
for idx, element in enumerate(element.children):
|
||||
for idx, child in enumerate(element.children):
|
||||
try:
|
||||
self.analyse_element(element, idx, doc)
|
||||
except Exception as exc_child:
|
||||
self.analyse_element(child, idx, doc)
|
||||
except Exception as exc:
|
||||
_log.error(f" -> error treating child: {exc}")
|
||||
raise exc
|
||||
|
||||
_log.error(" -> error treating child: ", exc_child)
|
||||
_log.error(" => element: ", element, "\n")
|
||||
raise exc_child
|
||||
elif isinstance(element, Tag):
|
||||
try:
|
||||
self.analyse_element(element, 0, doc)
|
||||
except Exception as exc:
|
||||
_log.error(f" -> error treating elem: {exc}")
|
||||
raise exc
|
||||
else:
|
||||
_log.warn(f"ignoring element of type {type(element)}")
|
||||
|
||||
except Exception as exc:
|
||||
_log.warn(f"error walking element: {type(element)}")
|
||||
pass
|
||||
|
||||
return doc
|
||||
|
||||
def analyse_element(self, element, idx, doc):
|
||||
"""
|
||||
|
||||
if element.name!=None:
|
||||
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
||||
"""
|
||||
#_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
||||
print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
||||
|
||||
#print(element.name)
|
||||
|
||||
if element.name in self.labels:
|
||||
self.labels[element.name] += 1
|
||||
@ -134,8 +153,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_header(element, idx, doc)
|
||||
elif element.name in ["p"]:
|
||||
print(" --> detected ...")
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_paragraph(element, idx, doc)
|
||||
print(" --> registered ...")
|
||||
|
||||
elif element.name in ["ul", "ol"]:
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_list(element, idx, doc)
|
||||
@ -151,6 +173,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif element.name == "img":
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_image(element, idx, doc)
|
||||
elif element.name == "svg":
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
#self.handle_image(element, idx, doc)
|
||||
_log.warn("Add `svg` elements")
|
||||
|
||||
elif True and isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'):
|
||||
try:
|
||||
#print("\n\n\nattempt decoding: ", element['data-content'])
|
||||
|
||||
# Decode the data-content attribute
|
||||
#data_content = html.unescape(element['data-content'])
|
||||
#print(data_content)
|
||||
|
||||
data_content = element['data-content']
|
||||
|
||||
# Parse the decoded HTML content
|
||||
content_soup = BeautifulSoup(data_content, 'html.parser')
|
||||
print("\n\n\nsoup: ", content_soup)
|
||||
|
||||
for jdx, _ in enumerate(content_soup):
|
||||
print(_)
|
||||
self.analyse_element(_, jdx, doc)
|
||||
except:
|
||||
_log.warn("could not parse the `data-content` attribute")
|
||||
|
||||
self.walk(element, doc)
|
||||
|
||||
else:
|
||||
self.walk(element, doc)
|
||||
|
||||
@ -229,11 +278,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def handle_paragraph(self, element, idx, doc):
|
||||
"""Handles paragraph tags (p)."""
|
||||
if element.text is None:
|
||||
print(" -> text is None ...")
|
||||
return
|
||||
text = element.text.strip()
|
||||
print("doc is adding paragraph: ", text)
|
||||
|
||||
label = DocItemLabel.PARAGRAPH
|
||||
if len(text) == 0:
|
||||
print(" -> text is zero length ...")
|
||||
return
|
||||
print("doc is adding paragraph: ", text)
|
||||
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
||||
|
||||
def handle_list(self, element, idx, doc):
|
||||
|
Loading…
Reference in New Issue
Block a user