Merge branch 'docling-project:main' into main

This commit is contained in:
ShiroYasha18 2025-05-27 18:02:13 +05:30 committed by GitHub
commit c4c59204d6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
) -> Optional[InputFormat]: ) -> Optional[InputFormat]:
"""Guess the input format of a document by checking part of its content.""" """Guess the input format of a document by checking part of its content."""
input_format: Optional[InputFormat] = None input_format: Optional[InputFormat] = None
content_str = content.decode("utf-8")
if mime == "application/xml": if mime == "application/xml":
content_str = content.decode("utf-8")
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str) match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
if match_doctype: if match_doctype:
xml_doctype = match_doctype.group() xml_doctype = match_doctype.group()
@@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
input_format = InputFormat.XML_JATS input_format = InputFormat.XML_JATS
elif mime == "text/plain": elif mime == "text/plain":
content_str = content.decode("utf-8")
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"): if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
input_format = InputFormat.XML_USPTO input_format = InputFormat.XML_USPTO