chore: add safe initialization of PatentUsptoDocumentBackend

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-08-03 07:52:20 +00:00 · 2024-12-17 13:42:44 +01:00 · 2024-12-17 13:42:44 +01:00 · 94735ec9c4
commit 94735ec9c4
parent 89c84ff749
4 changed files with 27 additions and 36 deletions
--- a/docling/backend/xml/__init__.py
+++ b/docling/backend/xml/__init__.py
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@ -67,6 +67,7 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
        self.patent_content: str = ""
        self.parser: Optional[PatentUspto] = None
        try:
            if isinstance(self.path_or_stream, BytesIO):
                while line := self.path_or_stream.readline().decode("utf-8"):
                    if line.startswith("<!DOCTYPE") or line == "PATN\n":
@ -78,6 +79,10 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
                        if line.startswith("<!DOCTYPE") or line == "PATN\n":
                            self._set_parser(line)
                        self.patent_content += line
        except Exception as exc:
            raise RuntimeError(
                f"Could not initialize USPTO backend for file with hash {self.document_hash}."
            ) from exc
    def _set_parser(self, doctype: str) -> None:
        doctype_line = doctype.lower()
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
@ -13,7 +13,6 @@ from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from
 )
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
 from typing_extensions import Self, override
 if TYPE_CHECKING:
    from docling.backend.pdf_backend import PdfPageBackend
@ -29,31 +28,17 @@ class ConversionStatus(str, Enum):
 class InputFormat(str, Enum):
-    """A document format supported by document backend parsers.
+    """A document format supported by document backend parsers."""
-    The field `is_custom` indicates whether the document format is more specific than
+    DOCX = "docx"
-    the standard and content formats, typically defined by MIME types.
+    PPTX = "pptx"
-    """
+    HTML = "html"
-
+    IMAGE = "image"
-    DOCX = "docx", False
+    PDF = "pdf"
-    PPTX = "pptx", False
+    ASCIIDOC = "asciidoc"
-    HTML = "html", False
+    MD = "md"
-    IMAGE = "image", False
+    XLSX = "xlsx"
-    PDF = "pdf", False
+    XML_USPTO = "xml_uspto"
    ASCIIDOC = "asciidoc", False
    MD = "md", False
    XLSX = "xlsx", False
    XML_USPTO = "xml_uspto", True
    @override
    def __new__(cls, value: str, _) -> Self:
        obj = str.__new__(cls, [value])
        obj._value_ = value
        return obj
    @override
    def __init__(self, _, is_custom: bool) -> None:
        self.is_custom: bool = is_custom
 class OutputFormat(str, Enum):
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -290,9 +290,10 @@ class _DocumentConversionInput(BaseModel):
        mime = mime or "text/plain"
        formats = MimeTypeToFormat.get(mime, [])
        if formats:
-            if len(formats) == 1 and not formats[0].is_custom:
+            # TODO: remove application/xml case after adding another XML parser
            if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
                return formats[0]
-            else:  # ambiguity or custom cases
+            else:  # ambiguity in formats
                return _DocumentConversionInput._guess_from_content(
                    content, mime, formats
                )