Replace newlines and double spaces in list items with single spaces

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-11-05 07:24:31 +01:00
parent f276c0cc90
commit 3257034631

View File

@ -1,3 +1,4 @@
import re
import logging import logging
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@ -46,17 +47,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
_log.debug("reading from BytesIO") _log.debug("reading from BytesIO")
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.soup = BeautifulSoup(text_stream, "html.parser") self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path): if isinstance(self.path_or_stream, Path):
_log.debug("reading from file") _log.debug("reading from file")
with open(self.path_or_stream, "r", encoding="utf-8") as f: with open(self.path_or_stream, "r", encoding="utf-8") as fr:
html_content = f.read() html_content = fr.read()
self.soup = BeautifulSoup(html_content, "html.parser") self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e: except Exception as e:
raise RuntimeError( raise RuntimeError(
f"Could not initialize HTML backend for file with hash {self.document_hash}." f"Could not initialize HTML backend for file with hash '{self.document_hash}'."
) from e ) from e
def is_valid(self) -> bool: def is_valid(self) -> bool:
@ -310,7 +311,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Flatten text, remove break lines: # Flatten text, remove break lines:
text = text.replace("\n", " ").replace("\r", "") text = text.replace("\n", " ").replace("\r", "")
text = " ".join(text.split()).strip() text = " ".join(text.split()).strip()
text = re.sub(r'\s{2,}', ' ', text)
marker = "" marker = ""
enumerated = False enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST: if parent_list_label == GroupLabel.ORDERED_LIST:
@ -334,7 +336,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element.text, str): elif isinstance(element.text, str):
text = element.text.strip() text = element.text.strip()
text = text.replace("\n", " ").replace("\r", "")
text = re.sub(r'\s{2,}', ' ', text)
marker = "" marker = ""
enumerated = False enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST: if parent_list_label == GroupLabel.ORDERED_LIST: