mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
replace new lines and double spaces in list-items with single spaces
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
f276c0cc90
commit
3257034631
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
import logging
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -46,17 +47,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
|
||||||
_log.debug("reading from BytesIO")
|
_log.debug("reading from BytesIO")
|
||||||
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||||
self.soup = BeautifulSoup(text_stream, "html.parser")
|
self.soup = BeautifulSoup(text_stream, "html.parser")
|
||||||
if isinstance(self.path_or_stream, Path):
|
if isinstance(self.path_or_stream, Path):
|
||||||
_log.debug("reading from file")
|
_log.debug("reading from file")
|
||||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
with open(self.path_or_stream, "r", encoding="utf-8") as fr:
|
||||||
html_content = f.read()
|
html_content = fr.read()
|
||||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Could not initialize HTML backend for file with hash {self.document_hash}."
|
f"Could not initialize HTML backend for file with hash '{self.document_hash}'."
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
@ -310,7 +311,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Flatten text, remove break lines:
|
# Flatten text, remove break lines:
|
||||||
text = text.replace("\n", " ").replace("\r", "")
|
text = text.replace("\n", " ").replace("\r", "")
|
||||||
text = " ".join(text.split()).strip()
|
text = " ".join(text.split()).strip()
|
||||||
|
text = re.sub(r'\s{2,}', ' ', text)
|
||||||
|
|
||||||
marker = ""
|
marker = ""
|
||||||
enumerated = False
|
enumerated = False
|
||||||
if parent_list_label == GroupLabel.ORDERED_LIST:
|
if parent_list_label == GroupLabel.ORDERED_LIST:
|
||||||
@ -334,7 +336,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element.text, str):
|
elif isinstance(element.text, str):
|
||||||
text = element.text.strip()
|
text = element.text.strip()
|
||||||
|
text = text.replace("\n", " ").replace("\r", "")
|
||||||
|
text = re.sub(r'\s{2,}', ' ', text)
|
||||||
|
|
||||||
marker = ""
|
marker = ""
|
||||||
enumerated = False
|
enumerated = False
|
||||||
if parent_list_label == GroupLabel.ORDERED_LIST:
|
if parent_list_label == GroupLabel.ORDERED_LIST:
|
||||||
|
Loading…
Reference in New Issue
Block a user