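Reformatted the code: reordered imports, normalized string quotes to double quotes, and wrapped a long condition (no functional changes)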

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-11-05 07:25:21 +01:00
parent 3257034631
commit ddd1474c8d

View File

@ -1,12 +1,11 @@
import re
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from bs4 import BeautifulSoup
from bs4.element import Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
@ -171,14 +170,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if self.is_body():
self.handle_svg(element, idx, doc)
elif isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'):
elif (
isinstance(element, Tag)
and element.name in ["section"]
and element.has_attr("data-content")
):
try:
# Decode the data-content attribute
# data_content = html.unescape(element['data-content'])
data_content = element['data-content']
data_content = element["data-content"]
# Parse the decoded HTML content
content_soup = BeautifulSoup(data_content, 'html.parser')
content_soup = BeautifulSoup(data_content, "html.parser")
for jdx, _ in enumerate(content_soup):
self.analyse_element(_, jdx, doc)
@ -311,7 +314,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Flatten text, remove break lines:
text = text.replace("\n", " ").replace("\r", "")
text = " ".join(text.split()).strip()
text = re.sub(r'\s{2,}', ' ', text)
text = re.sub(r"\s{2,}", " ", text)
marker = ""
enumerated = False
@ -337,7 +340,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element.text, str):
text = element.text.strip()
text = text.replace("\n", " ").replace("\r", "")
text = re.sub(r'\s{2,}', ' ', text)
text = re.sub(r"\s{2,}", " ", text)
marker = ""
enumerated = False