class InputFormat(str, Enum):
    """A document format supported by document backend parsers.

    Every member is declared as a ``(value, is_custom)`` pair. The first item
    becomes both the enum value and the string content of the member; the
    second item is stored on the member as the attribute `is_custom`.

    The field `is_custom` indicates whether the document format is more specific
    than the standard and content formats, typically defined by MIME types.
    """

    DOCX = "docx", False
    PPTX = "pptx", False
    HTML = "html", False
    IMAGE = "image", False
    PDF = "pdf", False
    ASCIIDOC = "asciidoc", False
    MD = "md", False
    XLSX = "xlsx", False
    XML_USPTO = "uspto", True

    @override
    def __new__(cls, value: str, _: bool) -> Self:
        """Create the member from the first item of its declaration tuple.

        Args:
            value: The string identifying the format.
            _: The `is_custom` flag, consumed by `__init__` and ignored here.

        Returns:
            The new member, whose string content and `_value_` are both `value`.
        """
        # Pass the bare string. Wrapping it in a list would make str() store the
        # list's repr ("['docx']"), so the member would no longer compare equal
        # to its own value.
        obj = str.__new__(cls, value)
        obj._value_ = value
        return obj

    @override
    def __init__(self, _: str, is_custom: bool) -> None:
        """Store the second item of the declaration tuple on the member.

        Args:
            _: The format string, already handled by `__new__`.
            is_custom: Whether the format is more specific than its MIME type.
        """
        self.is_custom: bool = is_custom
@staticmethod
def _guess_from_content(
    content: bytes, mime: str, formats: list[InputFormat]
) -> Optional[InputFormat]:
    """Guess the input format of a document by checking part of its content.

    Args:
        content: The leading bytes of the document. The caller may pass a
            truncated prefix (it reads the first 1KB) or an empty blob.
        mime: The MIME type already guessed for the document.
        formats: The candidate input formats registered for `mime`.

    Returns:
        The input format recognised from the content, or None if the content
        does not match any of the candidate formats.
    """
    input_format: Optional[InputFormat] = None
    # Decode leniently: the prefix can be cut in the middle of a multi-byte
    # character, or may not be UTF-8 at all. A strict decode would raise
    # UnicodeDecodeError instead of reporting "format not recognised".
    content_str = content.decode("utf-8", errors="ignore")

    if mime == "application/xml":
        # Capture the whole DOCTYPE declaration so the DTD name can be checked.
        match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
        if match_doctype:
            xml_doctype = match_doctype.group()
            if InputFormat.XML_USPTO in formats and any(
                item in xml_doctype
                for item in (
                    "us-patent-application-v4",
                    "us-patent-grant-v4",
                    "us-grant-025",
                    "patent-application-publication",
                )
            ):
                input_format = InputFormat.XML_USPTO

    elif mime == "text/plain":
        # Plain-text candidates are recognised by a first line reading "PATN".
        if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
            input_format = InputFormat.XML_USPTO

    return input_format
path_or_stream=path,