From 47c21a5edc038a90ff3b25829f52f5684dba3b6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o?= Date: Thu, 9 Jan 2025 18:24:26 -0300 Subject: [PATCH] disabled auto file mime type detection, rely on extension --- docling/datamodel/document.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 0483dc2a..0966de83 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -273,7 +273,8 @@ class _DocumentConversionInput(BaseModel): formats: list[InputFormat] = [] if isinstance(obj, Path): - mime = filetype.guess_mime(str(obj)) + # mime = filetype.guess_mime(str(obj)) # We're having too many conflicts with documents being wrongly classified as ZIP + mime = None if mime is None: ext = obj.suffix[1:] mime = _DocumentConversionInput._mime_from_extension(ext) @@ -359,7 +360,7 @@ class _DocumentConversionInput(BaseModel): mime = FormatToMimeType[InputFormat.PPTX][0] elif ext in FormatToExtensions[InputFormat.XLSX]: mime = FormatToMimeType[InputFormat.XLSX][0] - + return mime @staticmethod