mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
Replace newlines and double spaces in list items with single spaces
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
f276c0cc90
commit
3257034631
@ -1,3 +1,4 @@
|
||||
import re
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
@ -46,17 +47,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
_log.debug("reading from BytesIO")
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
self.soup = BeautifulSoup(text_stream, "html.parser")
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
_log.debug("reading from file")
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as fr:
|
||||
html_content = fr.read()
|
||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"Could not initialize HTML backend for file with hash {self.document_hash}."
|
||||
f"Could not initialize HTML backend for file with hash '{self.document_hash}'."
|
||||
) from e
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
@ -310,6 +311,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Flatten text, remove break lines:
|
||||
text = text.replace("\n", " ").replace("\r", "")
|
||||
text = " ".join(text.split()).strip()
|
||||
text = re.sub(r'\s{2,}', ' ', text)
|
||||
|
||||
marker = ""
|
||||
enumerated = False
|
||||
@ -334,6 +336,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif isinstance(element.text, str):
|
||||
text = element.text.strip()
|
||||
text = text.replace("\n", " ").replace("\r", "")
|
||||
text = re.sub(r'\s{2,}', ' ', text)
|
||||
|
||||
marker = ""
|
||||
enumerated = False
|
||||
|
Loading…
Reference in New Issue
Block a user