diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index e46ce09a..b19baa68 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,3 +1,4 @@ +import re import logging from io import BytesIO from pathlib import Path @@ -46,17 +47,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): try: if isinstance(self.path_or_stream, BytesIO): - text_stream = self.path_or_stream.getvalue().decode("utf-8") _log.debug("reading from BytesIO") + text_stream = self.path_or_stream.getvalue().decode("utf-8") self.soup = BeautifulSoup(text_stream, "html.parser") if isinstance(self.path_or_stream, Path): _log.debug("reading from file") - with open(self.path_or_stream, "r", encoding="utf-8") as f: - html_content = f.read() + with open(self.path_or_stream, "r", encoding="utf-8") as fr: + html_content = fr.read() self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: raise RuntimeError( - f"Could not initialize HTML backend for file with hash {self.document_hash}." + f"Could not initialize HTML backend for file with hash '{self.document_hash}'." ) from e def is_valid(self) -> bool: @@ -310,7 +311,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # Flatten text, remove break lines: text = text.replace("\n", " ").replace("\r", "") text = " ".join(text.split()).strip() - + text = re.sub(r'\s{2,}', ' ', text) + marker = "" enumerated = False if parent_list_label == GroupLabel.ORDERED_LIST: @@ -334,7 +336,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element.text, str): text = element.text.strip() - + text = text.replace("\n", " ").replace("\r", "") + text = re.sub(r'\s{2,}', ' ', text) + marker = "" enumerated = False if parent_list_label == GroupLabel.ORDERED_LIST: