diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 36e02dfe..dc873971 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,5 +1,5 @@
import logging
-from io import BytesIO
+from io import BytesIO, TextIOWrapper
from pathlib import Path
from typing import Set, Union
@@ -21,6 +21,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        _log.debug("About to init HTML backend...")  # module logger, not stdout
super().__init__(path_or_stream, document_hash)
self.soup = None
# HTML file:
@@ -33,6 +34,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[i] = None
self.labels = {}
+        try:  # parse eagerly; convert() reads self.soup
+            if isinstance(self.path_or_stream, BytesIO):
+                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                _log.debug("decoded %d characters of HTML from stream", len(text_stream))
+                self.soup = BeautifulSoup(text_stream, "html.parser")
+            elif isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                    html_content = f.read()
+                    self.soup = BeautifulSoup(html_content, "html.parser")
+        except Exception as e:
+            _log.error("could not parse html: {}".format(e))
+            raise  # __init__ must not return a value and no `doc` exists in this scope
+
def is_valid(self) -> bool:
return True
@@ -52,15 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-
- try:
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
- html_content = f.read()
- self.soup = BeautifulSoup(html_content, "html.parser")
- except Exception as e:
- _log.error("could not parse html: {}".format(e))
- return doc
-
+        _log.debug("Trying to convert HTML...")  # module logger, not stdout
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index 91be4fd8..22eae9d4 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -74,8 +74,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
# Parses the PPTX into a structured document model.
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
+        fname = (  # only Path inputs expose a filename; anything else gets ""
+            self.path_or_stream.name if isinstance(self.path_or_stream, Path) else ""
+        )
+
origin = DocumentOrigin(
- filename=self.path_or_stream.name,
+ filename=fname,
mimetype="application/vnd.ms-powerpoint",
binary_hash=self.document_hash,
)
diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py
index cdf1c670..aeb254a9 100644
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -18,7 +18,6 @@ input_paths = [
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
- Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2206.01062.pdf"),
]
input = DocumentConversionInput.from_paths(input_paths)