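Reformatted the code: reordered imports, normalized string quotes to double quotes, and wrapped an over-long `elif` condition (no functional changes)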

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-11-05 07:25:21 +01:00
parent 3257034631
commit ddd1474c8d

View File

@ -1,12 +1,11 @@
import re
import logging import logging
import re
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
@ -171,14 +170,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if self.is_body(): if self.is_body():
self.handle_svg(element, idx, doc) self.handle_svg(element, idx, doc)
elif isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'): elif (
isinstance(element, Tag)
and element.name in ["section"]
and element.has_attr("data-content")
):
try: try:
# Decode the data-content attribute # Decode the data-content attribute
# data_content = html.unescape(element['data-content']) # data_content = html.unescape(element['data-content'])
data_content = element['data-content'] data_content = element["data-content"]
# Parse the decoded HTML content # Parse the decoded HTML content
content_soup = BeautifulSoup(data_content, 'html.parser') content_soup = BeautifulSoup(data_content, "html.parser")
for jdx, _ in enumerate(content_soup): for jdx, _ in enumerate(content_soup):
self.analyse_element(_, jdx, doc) self.analyse_element(_, jdx, doc)
@ -311,7 +314,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Flatten text, remove break lines: # Flatten text, remove break lines:
text = text.replace("\n", " ").replace("\r", "") text = text.replace("\n", " ").replace("\r", "")
text = " ".join(text.split()).strip() text = " ".join(text.split()).strip()
text = re.sub(r'\s{2,}', ' ', text) text = re.sub(r"\s{2,}", " ", text)
marker = "" marker = ""
enumerated = False enumerated = False
@ -337,7 +340,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element.text, str): elif isinstance(element.text, str):
text = element.text.strip() text = element.text.strip()
text = text.replace("\n", " ").replace("\r", "") text = text.replace("\n", " ").replace("\r", "")
text = re.sub(r'\s{2,}', ' ', text) text = re.sub(r"\s{2,}", " ", text)
marker = "" marker = ""
enumerated = False enumerated = False