mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
restore guess format
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
bbb735d2de
commit
79c59cb2b0
@ -325,15 +325,15 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
formats = MimeTypeToFormat.get(mime, [])
|
formats = MimeTypeToFormat.get(mime, [])
|
||||||
_log.info(f"detected formats: {formats}")
|
_log.info(f"detected formats: {formats}")
|
||||||
|
|
||||||
input_format: Optional[InputFormat] = None
|
if formats:
|
||||||
if len(formats) == 1:
|
if len(formats) == 1 and mime not in ("text/plain"):
|
||||||
input_format = formats[0]
|
return formats[0]
|
||||||
|
else: # ambiguity in formats
|
||||||
if content:
|
return _DocumentConversionInput._guess_from_content(
|
||||||
input_format = _DocumentConversionInput._guess_from_content(
|
content, mime, formats
|
||||||
content, mime, formats
|
)
|
||||||
)
|
else:
|
||||||
return input_format
|
return None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _guess_from_content(
|
def _guess_from_content(
|
||||||
@ -342,9 +342,6 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
"""Guess the input format of a document by checking part of its content."""
|
"""Guess the input format of a document by checking part of its content."""
|
||||||
input_format: Optional[InputFormat] = None
|
input_format: Optional[InputFormat] = None
|
||||||
|
|
||||||
if len(formats) == 1:
|
|
||||||
input_format = formats[0]
|
|
||||||
|
|
||||||
if mime == "application/xml":
|
if mime == "application/xml":
|
||||||
content_str = content.decode("utf-8")
|
content_str = content.decode("utf-8")
|
||||||
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
||||||
|
Loading…
Reference in New Issue
Block a user