chore: add safe initialization of PatentUsptoDocumentBackend

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-08-01 15:02:21 +00:00 · 2024-12-17 13:42:44 +01:00 · 2024-12-17 13:42:44 +01:00 · 94735ec9c4
commit 94735ec9c4
parent 89c84ff749
4 changed files with 27 additions and 36 deletions
--- a/docling/backend/xml/__init__.py
+++ b/docling/backend/xml/__init__.py
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -67,17 +67,22 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
        self.patent_content: str = ""
        self.parser: Optional[PatentUspto] = None

-        if isinstance(self.path_or_stream, BytesIO):
-            while line := self.path_or_stream.readline().decode("utf-8"):
-                if line.startswith("<!DOCTYPE") or line == "PATN\n":
-                    self._set_parser(line)
-                self.patent_content += line
-        elif isinstance(self.path_or_stream, Path):
-            with open(self.path_or_stream, encoding="utf-8") as file_obj:
-                while line := file_obj.readline():
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                while line := self.path_or_stream.readline().decode("utf-8"):
                    if line.startswith("<!DOCTYPE") or line == "PATN\n":
                        self._set_parser(line)
                    self.patent_content += line
+            elif isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, encoding="utf-8") as file_obj:
+                    while line := file_obj.readline():
+                        if line.startswith("<!DOCTYPE") or line == "PATN\n":
+                            self._set_parser(line)
+                        self.patent_content += line
+        except Exception as exc:
+            raise RuntimeError(
+                f"Could not initialize USPTO backend for file with hash {self.document_hash}."
+            ) from exc

    def _set_parser(self, doctype: str) -> None:
        doctype_line = doctype.lower()
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from typing import TYPE_CHECKING, Dict, List, Optional, Union

 from docling_core.types.doc import (
@@ -13,7 +13,6 @@ from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from
 )
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
-from typing_extensions import Self, override

 if TYPE_CHECKING:
    from docling.backend.pdf_backend import PdfPageBackend
@@ -29,31 +28,17 @@ class ConversionStatus(str, Enum):


 class InputFormat(str, Enum):
-    """A document format supported by document backend parsers.
+    """A document format supported by document backend parsers."""

-    The field `is_custom` indicates whether the document format is more specific than
-    the standard and content formats, typically defined by MIME types.
-    """
-
-    DOCX = "docx", False
-    PPTX = "pptx", False
-    HTML = "html", False
-    IMAGE = "image", False
-    PDF = "pdf", False
-    ASCIIDOC = "asciidoc", False
-    MD = "md", False
-    XLSX = "xlsx", False
-    XML_USPTO = "xml_uspto", True
-
-    @override
-    def __new__(cls, value: str, _) -> Self:
-        obj = str.__new__(cls, [value])
-        obj._value_ = value
-        return obj
-
-    @override
-    def __init__(self, _, is_custom: bool) -> None:
-        self.is_custom: bool = is_custom
+    DOCX = "docx"
+    PPTX = "pptx"
+    HTML = "html"
+    IMAGE = "image"
+    PDF = "pdf"
+    ASCIIDOC = "asciidoc"
+    MD = "md"
+    XLSX = "xlsx"
+    XML_USPTO = "xml_uspto"


 class OutputFormat(str, Enum):
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -290,9 +290,10 @@ class _DocumentConversionInput(BaseModel):
        mime = mime or "text/plain"
        formats = MimeTypeToFormat.get(mime, [])
        if formats:
-            if len(formats) == 1 and not formats[0].is_custom:
+            # TODO: remove application/xml case after adding another XML parse
+            if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
                return formats[0]
-            else:  # ambiguity or custom cases
+            else:  # ambiguity in formats
                return _DocumentConversionInput._guess_from_content(
                    content, mime, formats
                )