mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-03 07:52:20 +00:00
chore: add safe initialization of PatentUsptoDocumentBackend
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
89c84ff749
commit
94735ec9c4
0
docling/backend/xml/__init__.py
Normal file
0
docling/backend/xml/__init__.py
Normal file
@ -67,6 +67,7 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.patent_content: str = ""
|
self.patent_content: str = ""
|
||||||
self.parser: Optional[PatentUspto] = None
|
self.parser: Optional[PatentUspto] = None
|
||||||
|
|
||||||
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
while line := self.path_or_stream.readline().decode("utf-8"):
|
while line := self.path_or_stream.readline().decode("utf-8"):
|
||||||
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
||||||
@ -78,6 +79,10 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
||||||
self._set_parser(line)
|
self._set_parser(line)
|
||||||
self.patent_content += line
|
self.patent_content += line
|
||||||
|
except Exception as exc:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Could not initialize USPTO backend for file with hash {self.document_hash}."
|
||||||
|
) from exc
|
||||||
|
|
||||||
def _set_parser(self, doctype: str) -> None:
|
def _set_parser(self, doctype: str) -> None:
|
||||||
doctype_line = doctype.lower()
|
doctype_line = doctype.lower()
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from enum import Enum, auto
|
from enum import Enum
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
@ -13,7 +13,6 @@ from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from
|
|||||||
)
|
)
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict
|
||||||
from typing_extensions import Self, override
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.backend.pdf_backend import PdfPageBackend
|
from docling.backend.pdf_backend import PdfPageBackend
|
||||||
@ -29,31 +28,17 @@ class ConversionStatus(str, Enum):
|
|||||||
|
|
||||||
|
|
||||||
class InputFormat(str, Enum):
|
class InputFormat(str, Enum):
|
||||||
"""A document format supported by document backend parsers.
|
"""A document format supported by document backend parsers."""
|
||||||
|
|
||||||
The field `is_custom` indicates whether the document format is more specific than
|
DOCX = "docx"
|
||||||
the standard and content formats, typically defined by MIME types.
|
PPTX = "pptx"
|
||||||
"""
|
HTML = "html"
|
||||||
|
IMAGE = "image"
|
||||||
DOCX = "docx", False
|
PDF = "pdf"
|
||||||
PPTX = "pptx", False
|
ASCIIDOC = "asciidoc"
|
||||||
HTML = "html", False
|
MD = "md"
|
||||||
IMAGE = "image", False
|
XLSX = "xlsx"
|
||||||
PDF = "pdf", False
|
XML_USPTO = "xml_uspto"
|
||||||
ASCIIDOC = "asciidoc", False
|
|
||||||
MD = "md", False
|
|
||||||
XLSX = "xlsx", False
|
|
||||||
XML_USPTO = "xml_uspto", True
|
|
||||||
|
|
||||||
@override
|
|
||||||
def __new__(cls, value: str, _) -> Self:
|
|
||||||
obj = str.__new__(cls, [value])
|
|
||||||
obj._value_ = value
|
|
||||||
return obj
|
|
||||||
|
|
||||||
@override
|
|
||||||
def __init__(self, _, is_custom: bool) -> None:
|
|
||||||
self.is_custom: bool = is_custom
|
|
||||||
|
|
||||||
|
|
||||||
class OutputFormat(str, Enum):
|
class OutputFormat(str, Enum):
|
||||||
|
@ -290,9 +290,10 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
mime = mime or "text/plain"
|
mime = mime or "text/plain"
|
||||||
formats = MimeTypeToFormat.get(mime, [])
|
formats = MimeTypeToFormat.get(mime, [])
|
||||||
if formats:
|
if formats:
|
||||||
if len(formats) == 1 and not formats[0].is_custom:
|
# TODO: remove application/xml case after adding another XML parser
|
||||||
|
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
|
||||||
return formats[0]
|
return formats[0]
|
||||||
else: # ambiguity or custom cases
|
else: # ambiguity in formats
|
||||||
return _DocumentConversionInput._guess_from_content(
|
return _DocumentConversionInput._guess_from_content(
|
||||||
content, mime, formats
|
content, mime, formats
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user