From 79c59cb2b0a8bc25d02003bddbbe3c8955b43b6f Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Fri, 25 Jul 2025 15:15:05 +0200
Subject: [PATCH] restore guess format

Signed-off-by: Michele Dolfi
---
 docling/datamodel/document.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index b1ca0372..b9832346 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -325,15 +325,15 @@ class _DocumentConversionInput(BaseModel):
         formats = MimeTypeToFormat.get(mime, [])
         _log.info(f"detected formats: {formats}")
 
-        input_format: Optional[InputFormat] = None
-        if len(formats) == 1:
-            input_format = formats[0]
-
-        if content:
-            input_format = _DocumentConversionInput._guess_from_content(
-                content, mime, formats
-            )
-        return input_format
+        if formats:
+            if len(formats) == 1 and mime not in ("text/plain"):
+                return formats[0]
+            else:  # ambiguity in formats
+                return _DocumentConversionInput._guess_from_content(
+                    content, mime, formats
+                )
+        else:
+            return None
 
     @staticmethod
     def _guess_from_content(
@@ -342,9 +342,6 @@ class _DocumentConversionInput(BaseModel):
         """Guess the input format of a document by checking part of its content."""
         input_format: Optional[InputFormat] = None
 
-        if len(formats) == 1:
-            input_format = formats[0]
-
         if mime == "application/xml":
             content_str = content.decode("utf-8")
             match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)