diff --git a/docling/backend/xml/__init__.py b/docling/backend/xml/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py index 7523b670..ef253b21 100644 --- a/docling/backend/xml/uspto_backend.py +++ b/docling/backend/xml/uspto_backend.py @@ -67,17 +67,22 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend): self.patent_content: str = "" self.parser: Optional[PatentUspto] = None - if isinstance(self.path_or_stream, BytesIO): - while line := self.path_or_stream.readline().decode("utf-8"): - if line.startswith(" None: doctype_line = doctype.lower() diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 1a073c63..bbd1a3ed 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,4 +1,4 @@ -from enum import Enum, auto +from enum import Enum from typing import TYPE_CHECKING, Dict, List, Optional, Union from docling_core.types.doc import ( @@ -13,7 +13,6 @@ from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from ) from PIL.Image import Image from pydantic import BaseModel, ConfigDict -from typing_extensions import Self, override if TYPE_CHECKING: from docling.backend.pdf_backend import PdfPageBackend @@ -29,31 +28,17 @@ class ConversionStatus(str, Enum): class InputFormat(str, Enum): - """A document format supported by document backend parsers. + """A document format supported by document backend parsers.""" - The field `is_custom` indicates whether the document format is more specific than - the standard and content formats, typically defined by MIME types. 
- """ - - DOCX = "docx", False - PPTX = "pptx", False - HTML = "html", False - IMAGE = "image", False - PDF = "pdf", False - ASCIIDOC = "asciidoc", False - MD = "md", False - XLSX = "xlsx", False - XML_USPTO = "xml_uspto", True - - @override - def __new__(cls, value: str, _) -> Self: - obj = str.__new__(cls, [value]) - obj._value_ = value - return obj - - @override - def __init__(self, _, is_custom: bool) -> None: - self.is_custom: bool = is_custom + DOCX = "docx" + PPTX = "pptx" + HTML = "html" + IMAGE = "image" + PDF = "pdf" + ASCIIDOC = "asciidoc" + MD = "md" + XLSX = "xlsx" + XML_USPTO = "xml_uspto" class OutputFormat(str, Enum): diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 1be71294..5f6e5ec3 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -290,9 +290,10 @@ class _DocumentConversionInput(BaseModel): mime = mime or "text/plain" formats = MimeTypeToFormat.get(mime, []) if formats: - if len(formats) == 1 and not formats[0].is_custom: + # TODO: remove application/xml case after adding another XML parser + if len(formats) == 1 and mime not in ("text/plain", "application/xml"): return formats[0] - else: # ambiguity or custom cases + else: # ambiguity in formats return _DocumentConversionInput._guess_from_content( content, mime, formats )