From 025983f07bce1d3a6399ce8026653c8ce523b836 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 11 Oct 2024 11:18:47 +0200
Subject: [PATCH] Backend error handling fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/backend/docling_parse_backend.py |  2 +-
 docling/backend/html_backend.py          | 20 +++++++-------
 docling/backend/mspowerpoint_backend.py  |  8 +++---
 docling/backend/msword_backend.py        | 18 ++++++++-----
 docling/backend/pypdfium2_backend.py     |  2 +-
 docling/datamodel/document.py            | 34 +++++++++++++++++++-----
 examples/batch_convert.py                |  2 +-
 examples/run_with_formats.py             |  1 +
 8 files changed, 58 insertions(+), 29 deletions(-)
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
index 7d5c3113..e3e2293e 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
 
         if not success:
             raise RuntimeError(
-                f"docling-parse could not load document {document_hash}."
+                f"docling-parse could not load document with hash {document_hash}."
             )
 
     def page_count(self) -> int:
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 2ab27095..cae81085 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
 
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        print("About to init HTML backend...")
+        _log.debug("About to init HTML backend...")
         super().__init__(path_or_stream, document_hash)
         self.soup = None
         # HTML file:
@@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                text_stream = byte_stream.getvalue().decode("utf-8")
-                print(text_stream)
+                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                 self.soup = BeautifulSoup(text_stream, "html.parser")
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     html_content = f.read()
                     self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:
-            _log.error("could not parse html: {}".format(e))
-            return doc
+            raise RuntimeError(
+                f"Could not initialize HTML backend for file with hash {document_hash}."
+            ) from e
 
     def is_valid(self) -> bool:
         return True
@@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def convert(self) -> DoclingDocument:
         # access self.path_or_stream to load stuff
         doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-        print("Trying to convert HTML...")
+        _log.debug("Trying to convert HTML...")
         # Replace <br> tags with newline characters
         for br in self.soup.body.find_all("br"):
             br.replace_with("\n")
@@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def analyse_element(self, element, idx, doc):
         """
         if element.name!=None:
-            print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
+            _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
         """
 
         if element.name in self.labels:
@@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         doc.add_table(data=data, parent=self.parents[self.level])
 
-    def get_list_text(list_element, level=0):
+    def get_list_text(self, list_element, level=0):
         """Recursively extract text from <ul> or <ol> with proper indentation."""
         result = []
         bullet_char = "*"  # Default bullet character for unordered lists
@@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 # Handle nested lists
                 nested_list = li.find(["ul", "ol"])
                 if nested_list:
-                    result.extend(get_list_text(nested_list, level + 1))
+                    result.extend(self.get_list_text(nested_list, level + 1))
         elif list_element.name == "ul":  # For unordered lists, use bullet points
             for li in list_element.find_all("li", recursive=False):
                 # Add bullet points for unordered lists
@@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 # Handle nested lists
                 nested_list = li.find(["ul", "ol"])
                 if nested_list:
-                    result.extend(get_list_text(nested_list, level + 1))
+                    result.extend(self.get_list_text(nested_list, level + 1))
 
         return result
 
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index f5a635ee..7703e3b1 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         self.path_or_stream = path_or_stream
 
         self.pptx_obj = None
-        self.valid = True
+        self.valid = False  # flipped to True only after the file loads
         try:
             self.pptx_obj = Presentation(self.path_or_stream)
+            self.valid = True
-        except Exception:
-            _log.error("could not parse pptx")
-            self.valid = False
+        except Exception as e:  # bind the cause; "from e" below needs it
+            raise RuntimeError(
+                f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
+            ) from e
 
         return
 
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index fbde6869..49911009 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         # self.initialise(path_or_stream)
         # Word file:
         self.path_or_stream = path_or_stream
+        self.valid = False
         # Initialise the parents for the hierarchy
         self.max_levels = 10
         self.level_at_new_list = None
@@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             "indents": [None],
         }
 
+        self.docx_obj = None  # parsed document, consumed by convert()
+        try:
+            self.docx_obj = docx.Document(self.path_or_stream)
+            self.valid = True
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {document_hash}"
+            ) from e
+
     def is_valid(self) -> bool:
         return True
 
@@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def convert(self) -> DoclingDocument:
         # Parses the DOCX into a structured document model.
         doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-        docx_obj = None
-        try:
-            docx_obj = docx.Document(self.path_or_stream)
-        except Exception:
-            _log.error("could not parse docx")
-            return doc
 
         # self.initialise()
-        doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
+        doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
         return doc
 
     def update_history(self, name, level, numid, ilevel):
diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
index e7d7ae84..e4c6e423 100644
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
             self._pdoc = pdfium.PdfDocument(path_or_stream)
         except PdfiumError as e:
             raise RuntimeError(
-                f"pypdfium could not load document {document_hash}"
+                f"pypdfium could not load document with hash {document_hash}"
             ) from e
 
     def page_count(self) -> int:
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index b7c020f2..721ccfcd 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,4 +1,5 @@
 import logging
+import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
@@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
                 )
 
     def _guess_format(self, obj):
+        content = mime = None  # defaults when obj is neither Path nor stream
         if isinstance(obj, Path):
             mime = filetype.guess_mime(str(obj))
-        elif isinstance(obj, DocumentStream):
-            mime = filetype.guess_mime(obj.stream.read(8192))
-        if mime is None:
-            # TODO improve this.
+            if mime is None:
+                with obj.open("rb") as f:
+                    content = f.read(1024)  # Read first 1KB
 
-            if obj.suffix == ".html":
-                mime = "text/html"
+        elif isinstance(obj, DocumentStream):
+            obj.stream.seek(0)
+            content = obj.stream.read(8192)
+            obj.stream.seek(0)
+            mime = filetype.guess_mime(content)
+
+        if mime is None and content is not None:
+            mime = self._detect_html_xhtml(content)
 
         format = MimeTypeToFormat.get(mime)
         return format
 
+    def _detect_html_xhtml(self, content):  # content: leading bytes of the input
+        content_str = content.decode("ascii", errors="ignore").lower()
+        # Strip <!-- ... --> comments so the anchored matches below see the first tag
+        content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
+        content_str = content_str.lstrip()
+
+        if re.match(r"<\?xml", content_str):  # XML declaration at the very start
+            if "xhtml" in content_str[:1000]:
+                return "application/xhtml+xml"
+
+        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+            return "text/html"
+
+        return None  # neither HTML nor XHTML recognised
+
     @classmethod
     def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
         paths = [Path(p) for p in paths]
diff --git a/examples/batch_convert.py b/examples/batch_convert.py
index ca4988f3..e54193f0 100644
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@@ -120,7 +120,7 @@ def main():
     ]
 
     # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
-    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
+    # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
     # input = DocumentConversionInput.from_streams(docs)
 
     doc_converter = DocumentConverter()
diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py
index 37d49e1c..398121bd 100644
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -38,6 +38,7 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
         InputFormat.PDF,
         # InputFormat.IMAGE,
         InputFormat.DOCX,
+        InputFormat.HTML,
     ],  # whitelist formats, other files are ignored.
     format_options={
         InputFormat.PDF: PdfFormatOption(