Backend error handling fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-11 14:18:30 +00:00 · 2024-10-11 11:18:47 +02:00
parent 304d16029a
commit 025983f07b
8 changed files with 58 additions and 29 deletions
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):

        if not success:
            raise RuntimeError(
-                f"docling-parse could not load document {document_hash}."
+                f"docling-parse could not load document with hash {document_hash}."
            )

    def page_count(self) -> int:
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)

 class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        print("About to init HTML backend...")
+        _log.debug("About to init HTML backend...")
        super().__init__(path_or_stream, document_hash)
        self.soup = None
        # HTML file:
@@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

        try:
            if isinstance(self.path_or_stream, BytesIO):
-                text_stream = byte_stream.getvalue().decode("utf-8")
-                print(text_stream)
+                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                self.soup = BeautifulSoup(text_stream, "html.parser")
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    html_content = f.read()
                    self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
-            _log.error("could not parse html: {}".format(e))
-            return doc
+            raise RuntimeError(
+                f"Could not initialize HTML backend for file with hash {document_hash}."
+            ) from e

    def is_valid(self) -> bool:
        return True
@@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def convert(self) -> DoclingDocument:
        # access self.path_or_stream to load stuff
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-        print("Trying to convert HTML...")
+        _log.debug("Trying to convert HTML...")
        # Replace <br> tags with newline characters
        for br in self.soup.body.find_all("br"):
            br.replace_with("\n")
@@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def analyse_element(self, element, idx, doc):
        """
        if element.name!=None:
-            print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
+            _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
        """

        if element.name in self.labels:
@@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

        doc.add_table(data=data, parent=self.parents[self.level])

-    def get_list_text(list_element, level=0):
+    def get_list_text(self, list_element, level=0):
        """Recursively extract text from <ul> or <ol> with proper indentation."""
        result = []
        bullet_char = "*"  # Default bullet character for unordered lists
@@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
-                    result.extend(get_list_text(nested_list, level + 1))
+                    result.extend(self.get_list_text(nested_list, level + 1))
        elif list_element.name == "ul":  # For unordered lists, use bullet points
            for li in list_element.find_all("li", recursive=False):
                # Add bullet points for unordered lists
@@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
-                    result.extend(get_list_text(nested_list, level + 1))
+                    result.extend(self.get_list_text(nested_list, level + 1))

        return result

--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        self.path_or_stream = path_or_stream

        self.pptx_obj = None
-        self.valid = True
+        self.valid = False
        try:
            self.pptx_obj = Presentation(self.path_or_stream)
+            self.valid = True
-        except Exception:
+        except Exception as e:
-            _log.error("could not parse pptx")
-            self.valid = False
+            raise RuntimeError(
+                f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
+            ) from e

        return

--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        # self.initialise(path_or_stream)
        # Word file:
        self.path_or_stream = path_or_stream
+        self.valid = False
        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.level_at_new_list = None
@@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            "indents": [None],
        }

+        self.docx_obj = None
+        try:
+            self.docx_obj = docx.Document(self.path_or_stream)
+            self.valid = True
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {document_hash}"
+            ) from e
+
    def is_valid(self) -> bool:
        return True

@@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def convert(self) -> DoclingDocument:
        # Parses the DOCX into a structured document model.
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-        docx_obj = None
-        try:
-            docx_obj = docx.Document(self.path_or_stream)
-        except Exception:
-            _log.error("could not parse docx")
-            return doc

        # self.initialise()
-        doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
+        doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
        return doc

    def update_history(self, name, level, numid, ilevel):
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
            self._pdoc = pdfium.PdfDocument(path_or_stream)
        except PdfiumError as e:
            raise RuntimeError(
-                f"pypdfium could not load document {document_hash}"
+                f"pypdfium could not load document with hash {document_hash}"
            ) from e

    def page_count(self) -> int:
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,4 +1,5 @@
 import logging
+import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
@@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
                )

    def _guess_format(self, obj):
+        content = None
        if isinstance(obj, Path):
            mime = filetype.guess_mime(str(obj))
-        elif isinstance(obj, DocumentStream):
-            mime = filetype.guess_mime(obj.stream.read(8192))
-        if mime is None:
-            # TODO improve this.
+            if mime is None:
+                with obj.open("rb") as f:
+                    content = f.read(1024)  # Read first 1KB

-            if obj.suffix == ".html":
-                mime = "text/html"
+        elif isinstance(obj, DocumentStream):
+            obj.stream.seek(0)
+            content = obj.stream.read(8192)
+            obj.stream.seek(0)
+            mime = filetype.guess_mime(content)
+
+        if mime is None:
+            mime = self._detect_html_xhtml(content)

        format = MimeTypeToFormat.get(mime)
        return format

+    def _detect_html_xhtml(self, content):
+        content_str = content.decode("ascii", errors="ignore").lower()
+        # Remove XML comments
+        content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
+        content_str = content_str.lstrip()
+
+        if re.match(r"<\?xml", content_str):
+            if "xhtml" in content_str[:1000]:
+                return "application/xhtml+xml"
+
+        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+            return "text/html"
+
+        return None
+
    @classmethod
    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
        paths = [Path(p) for p in paths]
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@@ -120,7 +120,7 @@ def main():
    ]

    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
-    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
+    # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

    doc_converter = DocumentConverter()
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -38,6 +38,7 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
        InputFormat.PDF,
        # InputFormat.IMAGE,
        InputFormat.DOCX,
+        InputFormat.HTML,
    ],  # whitelist formats, other files are ignored.
    format_options={
        InputFormat.PDF: PdfFormatOption(