Merge from upstream

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2024-10-08 16:40:55 +02:00
parent 203cf19b1b 07d952acf9
commit 080042d06d
3 changed files with 21 additions and 12 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,5 +1,5 @@
 import logging
-from io import BytesIO
+from io import BytesIO, TextIOWrapper
 from pathlib import Path
 from typing import Set, Union

@@ -21,6 +21,7 @@ _log = logging.getLogger(__name__)

 class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        _log.debug("About to init HTML backend...")
        super().__init__(path_or_stream, document_hash)
        self.soup = None
        # HTML file:
@@ -33,6 +34,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            self.parents[i] = None
        self.labels = {}

+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                # Decode the in-memory bytes ourselves so the parser receives text.
+                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                self.soup = BeautifulSoup(text_stream, "html.parser")
+            elif isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                    self.soup = BeautifulSoup(f.read(), "html.parser")
+        except Exception as e:
+            # __init__ cannot return a document; log, then surface the failure.
+            _log.error("could not parse html: {}".format(e))
+            raise RuntimeError("could not parse html") from e
+
    def is_valid(self) -> bool:
        return True

@@ -52,15 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def convert(self) -> DoclingDocument:
        # access self.path_or_stream to load stuff
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-
-        try:
-            with open(self.path_or_stream, "r", encoding="utf-8") as f:
-                html_content = f.read()
-                self.soup = BeautifulSoup(html_content, "html.parser")
-        except Exception as e:
-            _log.error("could not parse html: {}".format(e))
-            return doc
-
+        _log.debug("Trying to convert HTML...")
        # Replace <br> tags with newline characters
        for br in self.soup.body.find_all("br"):
            br.replace_with("\n")
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -74,8 +74,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        # Parses the PPTX into a structured document model.
        # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)

+        fname = ""
+        if isinstance(self.path_or_stream, Path):
+            fname = self.path_or_stream.name
+
        origin = DocumentOrigin(
-            filename=self.path_or_stream.name,
+            filename=fname,
            mimetype="application/vnd.ms-powerpoint",
            binary_hash=self.document_hash,
        )
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -18,7 +18,6 @@ input_paths = [
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
-    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2206.01062.pdf"),
 ]
 input = DocumentConversionInput.from_paths(input_paths)