reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-11-05 07:25:21 +01:00
parent 3257034631
commit ddd1474c8d

View File

@ -1,12 +1,11 @@
import re
import logging import logging
import re
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
@ -123,7 +122,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
raise exc raise exc
else: else:
_log.debug(f"ignoring element of type {type(element)}") _log.debug(f"ignoring element of type {type(element)}")
except Exception as exc: except Exception as exc:
_log.debug(f"error walking element: {type(element)}") _log.debug(f"error walking element: {type(element)}")
pass pass
@ -132,12 +131,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def is_body(self): def is_body(self):
return (not self.contains_h1) or (self.contains_h1 and self.detected_h1) return (not self.contains_h1) or (self.contains_h1 and self.detected_h1)
def analyse_element(self, element, idx, doc): def analyse_element(self, element, idx, doc):
if element.name!=None: if element.name != None:
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") _log.debug("\t" * self.level, idx, "\t", f"{element.name} ({self.level})")
if element.name in self.labels: if element.name in self.labels:
self.labels[element.name] += 1 self.labels[element.name] += 1
else: else:
@ -171,14 +170,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if self.is_body(): if self.is_body():
self.handle_svg(element, idx, doc) self.handle_svg(element, idx, doc)
elif isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'): elif (
isinstance(element, Tag)
and element.name in ["section"]
and element.has_attr("data-content")
):
try: try:
# Decode the data-content attribute # Decode the data-content attribute
#data_content = html.unescape(element['data-content']) # data_content = html.unescape(element['data-content'])
data_content = element['data-content'] data_content = element["data-content"]
# Parse the decoded HTML content # Parse the decoded HTML content
content_soup = BeautifulSoup(data_content, 'html.parser') content_soup = BeautifulSoup(data_content, "html.parser")
for jdx, _ in enumerate(content_soup): for jdx, _ in enumerate(content_soup):
self.analyse_element(_, jdx, doc) self.analyse_element(_, jdx, doc)
@ -186,7 +189,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.debug("could not parse the `data-content` attribute") _log.debug("could not parse the `data-content` attribute")
self.walk(element, doc) self.walk(element, doc)
else: else:
self.walk(element, doc) self.walk(element, doc)
@ -264,9 +267,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_paragraph(self, element, idx, doc): def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p).""" """Handles paragraph tags (p)."""
if element.text is None: if element.text is None:
return return
text = element.text.strip() text = element.text.strip()
if len(text) == 0: if len(text) == 0:
return return
@ -311,8 +314,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Flatten text, remove break lines: # Flatten text, remove break lines:
text = text.replace("\n", " ").replace("\r", "") text = text.replace("\n", " ").replace("\r", "")
text = " ".join(text.split()).strip() text = " ".join(text.split()).strip()
text = re.sub(r'\s{2,}', ' ', text) text = re.sub(r"\s{2,}", " ", text)
marker = "" marker = ""
enumerated = False enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST: if parent_list_label == GroupLabel.ORDERED_LIST:
@ -337,8 +340,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element.text, str): elif isinstance(element.text, str):
text = element.text.strip() text = element.text.strip()
text = text.replace("\n", " ").replace("\r", "") text = text.replace("\n", " ").replace("\r", "")
text = re.sub(r'\s{2,}', ' ', text) text = re.sub(r"\s{2,}", " ", text)
marker = "" marker = ""
enumerated = False enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST: if parent_list_label == GroupLabel.ORDERED_LIST:
@ -491,7 +494,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_image(self, element, idx, doc): def handle_image(self, element, idx, doc):
"""Handles image tags (img).""" """Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None) doc.add_picture(parent=self.parents[self.level], caption=None)
def handle_svg(self, element, idx, doc): def handle_svg(self, element, idx, doc):
"""Handles svg tags.""" """Handles svg tags."""
doc.add_picture(parent=self.parents[self.level], caption=None) doc.add_picture(parent=self.parents[self.level], caption=None)