From 07d952acf97ecab216fc397d3a810f641c8a07f2 Mon Sep 17 00:00:00 2001
From: Maxim Lysak <mly@zurich.ibm.com>
Date: Tue, 8 Oct 2024 16:37:47 +0200
Subject: [PATCH] Handle in-memory stream input in HTML and PowerPoint backends

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
---
 docling/backend/html_backend.py         | 26 +++++++++++++++----------
 docling/backend/mspowerpoint_backend.py |  6 +++++-
 examples/run_with_formats.py            | 12 ++++++++++++
 3 files changed, 33 insertions(+), 11 deletions(-)
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index c0315aaf..bd6aff56 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,5 +1,5 @@
 import logging
-from io import BytesIO
+from io import BytesIO, TextIOWrapper
 from pathlib import Path
 from typing import Set, Union
 
@@ -27,6 +27,7 @@ _log = logging.getLogger(__name__)
 
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        _log.debug("About to init HTML backend...")
         super().__init__(path_or_stream, document_hash)
         self.soup = None
         # HTML file:
@@ -39,6 +40,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[i] = None
         self.labels = {}
 
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                # In-memory input: decode the buffer held by the stream itself.
+                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                self.soup = BeautifulSoup(text_stream, "html.parser")
+            elif isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                    html_content = f.read()
+                    self.soup = BeautifulSoup(html_content, "html.parser")
+        except Exception as e:
+            _log.error("could not parse html: {}".format(e))
+            raise  # __init__ cannot return a document; surface the failure
+
     def is_valid(self) -> bool:
         return True
 
@@ -58,15 +72,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def convert(self) -> DoclingDocument:
         # access self.path_or_stream to load stuff
         doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-
-        try:
-            with open(self.path_or_stream, "r", encoding="utf-8") as f:
-                html_content = f.read()
-                self.soup = BeautifulSoup(html_content, "html.parser")
-        except Exception as e:
-            _log.error("could not parse html: {}".format(e))
-            return doc
-
+        _log.debug("Trying to convert HTML...")
         # Replace <br> tags with newline characters
         for br in self.soup.body.find_all("br"):
             br.replace_with("\n")
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index 49027356..16e0de69 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -78,8 +78,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         # Parses the PPTX into a structured document model.
         # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
 
+        fname = (
+            self.path_or_stream.name if isinstance(self.path_or_stream, Path) else ""
+        )
+
         origin = DocumentOrigin(
-            filename=self.path_or_stream.name,
+            filename=fname,
             mimetype="application/vnd.ms-powerpoint",
             binary_hash=self.document_hash,
         )
diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py
index f143c9f8..573c58b1 100644
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -1,5 +1,6 @@
 import json
 import logging
+from io import BytesIO, TextIOWrapper
 from pathlib import Path
 from typing import Iterable
 
@@ -9,6 +10,7 @@ from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
+    DocumentStream,
     InputFormat,
     PdfPipelineOptions,
     PipelineOptions,
@@ -29,6 +31,16 @@ input_paths = [
     Path("tests/data/powerpoint_sample.pptx"),
     Path("tests/data/2206.01062.pdf"),
 ]
+
+input_bytes = []
+for p in input_paths:
+    # Load the whole file into memory; read_bytes() also closes the handle.
+    buf = BytesIO(p.read_bytes())
+    # Pair the buffer with its file name for stream-based input.
+    bstream = DocumentStream(filename=p.name, stream=buf)
+    input_bytes.append(bstream)
+
+# input = DocumentConversionInput.from_streams(input_bytes)
 input = DocumentConversionInput.from_paths(input_paths)
 
 # for defaults use: