chore: add safe initialization of PatentUsptoDocumentBackend

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2024-12-17 13:42:44 +01:00
parent 89c84ff749
commit 94735ec9c4
4 changed files with 27 additions and 36 deletions

View File

View File

@ -67,17 +67,22 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
self.patent_content: str = ""
self.parser: Optional[PatentUspto] = None
if isinstance(self.path_or_stream, BytesIO):
while line := self.path_or_stream.readline().decode("utf-8"):
if line.startswith("<!DOCTYPE") or line == "PATN\n":
self._set_parser(line)
self.patent_content += line
elif isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, encoding="utf-8") as file_obj:
while line := file_obj.readline():
try:
if isinstance(self.path_or_stream, BytesIO):
while line := self.path_or_stream.readline().decode("utf-8"):
if line.startswith("<!DOCTYPE") or line == "PATN\n":
self._set_parser(line)
self.patent_content += line
elif isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, encoding="utf-8") as file_obj:
while line := file_obj.readline():
if line.startswith("<!DOCTYPE") or line == "PATN\n":
self._set_parser(line)
self.patent_content += line
except Exception as exc:
raise RuntimeError(
f"Could not initialize USPTO backend for file with hash {self.document_hash}."
) from exc
def _set_parser(self, doctype: str) -> None:
doctype_line = doctype.lower()

View File

@ -1,4 +1,4 @@
from enum import Enum, auto
from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from docling_core.types.doc import (
@ -13,7 +13,6 @@ from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
from typing_extensions import Self, override
if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend
@ -29,31 +28,17 @@ class ConversionStatus(str, Enum):
class InputFormat(str, Enum):
"""A document format supported by document backend parsers.
"""A document format supported by document backend parsers."""
The field `is_custom` indicates whether the document format is more specific than
the standard and content formats, typically defined by MIME types.
"""
DOCX = "docx", False
PPTX = "pptx", False
HTML = "html", False
IMAGE = "image", False
PDF = "pdf", False
ASCIIDOC = "asciidoc", False
MD = "md", False
XLSX = "xlsx", False
XML_USPTO = "xml_uspto", True
@override
def __new__(cls, value: str, _) -> Self:
obj = str.__new__(cls, [value])
obj._value_ = value
return obj
@override
def __init__(self, _, is_custom: bool) -> None:
self.is_custom: bool = is_custom
DOCX = "docx"
PPTX = "pptx"
HTML = "html"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
MD = "md"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
class OutputFormat(str, Enum):

View File

@ -290,9 +290,10 @@ class _DocumentConversionInput(BaseModel):
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
if len(formats) == 1 and not formats[0].is_custom:
# TODO: remove application/xml case after adding another XML parser
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
return formats[0]
else: # ambiguity or custom cases
else: # ambiguity in formats
return _DocumentConversionInput._guess_from_content(
content, mime, formats
)