mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
chore: add safe initialization of PatentUsptoDocumentBackend
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
89c84ff749
commit
94735ec9c4
0
docling/backend/xml/__init__.py
Normal file
0
docling/backend/xml/__init__.py
Normal file
@ -67,17 +67,22 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.patent_content: str = ""
|
||||
self.parser: Optional[PatentUspto] = None
|
||||
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
while line := self.path_or_stream.readline().decode("utf-8"):
|
||||
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
||||
self._set_parser(line)
|
||||
self.patent_content += line
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, encoding="utf-8") as file_obj:
|
||||
while line := file_obj.readline():
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
while line := self.path_or_stream.readline().decode("utf-8"):
|
||||
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
||||
self._set_parser(line)
|
||||
self.patent_content += line
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, encoding="utf-8") as file_obj:
|
||||
while line := file_obj.readline():
|
||||
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
||||
self._set_parser(line)
|
||||
self.patent_content += line
|
||||
except Exception as exc:
|
||||
raise RuntimeError(
|
||||
f"Could not initialize USPTO backend for file with hash {self.document_hash}."
|
||||
) from exc
|
||||
|
||||
def _set_parser(self, doctype: str) -> None:
|
||||
doctype_line = doctype.lower()
|
||||
|
@ -1,4 +1,4 @@
|
||||
from enum import Enum, auto
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
@ -13,7 +13,6 @@ from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from
|
||||
)
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from typing_extensions import Self, override
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.backend.pdf_backend import PdfPageBackend
|
||||
@ -29,31 +28,17 @@ class ConversionStatus(str, Enum):
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
|
||||
"""A document format supported by document backend parsers.
|
||||
"""A document format supported by document backend parsers."""
|
||||
|
||||
The field `is_custom` indicates whether the document format is more specific than
|
||||
the standard and content formats, typically defined by MIME types.
|
||||
"""
|
||||
|
||||
DOCX = "docx", False
|
||||
PPTX = "pptx", False
|
||||
HTML = "html", False
|
||||
IMAGE = "image", False
|
||||
PDF = "pdf", False
|
||||
ASCIIDOC = "asciidoc", False
|
||||
MD = "md", False
|
||||
XLSX = "xlsx", False
|
||||
XML_USPTO = "xml_uspto", True
|
||||
|
||||
@override
|
||||
def __new__(cls, value: str, _) -> Self:
|
||||
obj = str.__new__(cls, [value])
|
||||
obj._value_ = value
|
||||
return obj
|
||||
|
||||
@override
|
||||
def __init__(self, _, is_custom: bool) -> None:
|
||||
self.is_custom: bool = is_custom
|
||||
DOCX = "docx"
|
||||
PPTX = "pptx"
|
||||
HTML = "html"
|
||||
IMAGE = "image"
|
||||
PDF = "pdf"
|
||||
ASCIIDOC = "asciidoc"
|
||||
MD = "md"
|
||||
XLSX = "xlsx"
|
||||
XML_USPTO = "xml_uspto"
|
||||
|
||||
|
||||
class OutputFormat(str, Enum):
|
||||
|
@ -290,9 +290,10 @@ class _DocumentConversionInput(BaseModel):
|
||||
mime = mime or "text/plain"
|
||||
formats = MimeTypeToFormat.get(mime, [])
|
||||
if formats:
|
||||
if len(formats) == 1 and not formats[0].is_custom:
|
||||
# TODO: remove application/xml case after adding another XML parser
|
||||
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
|
||||
return formats[0]
|
||||
else: # ambiguity or custom cases
|
||||
else: # ambiguity in formats
|
||||
return _DocumentConversionInput._guess_from_content(
|
||||
content, mime, formats
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user