mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Merge branch 'docling-project:main' into main
This commit is contained in:
commit
c4c59204d6
@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
) -> Optional[InputFormat]:
|
) -> Optional[InputFormat]:
|
||||||
"""Guess the input format of a document by checking part of its content."""
|
"""Guess the input format of a document by checking part of its content."""
|
||||||
input_format: Optional[InputFormat] = None
|
input_format: Optional[InputFormat] = None
|
||||||
content_str = content.decode("utf-8")
|
|
||||||
|
|
||||||
if mime == "application/xml":
|
if mime == "application/xml":
|
||||||
|
content_str = content.decode("utf-8")
|
||||||
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
||||||
if match_doctype:
|
if match_doctype:
|
||||||
xml_doctype = match_doctype.group()
|
xml_doctype = match_doctype.group()
|
||||||
@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
input_format = InputFormat.XML_JATS
|
input_format = InputFormat.XML_JATS
|
||||||
|
|
||||||
elif mime == "text/plain":
|
elif mime == "text/plain":
|
||||||
|
content_str = content.decode("utf-8")
|
||||||
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
||||||
input_format = InputFormat.XML_USPTO
|
input_format = InputFormat.XML_USPTO
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user