From a09273ecb8857088cb219489106b4eb3c70b3840 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Mon, 14 Apr 2025 07:06:46 +0200
Subject: [PATCH] apply to other ms office zip formats

Signed-off-by: Michele Dolfi
---
 docling/datamodel/document.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 70d08b75..93dfd1a5 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -283,14 +283,13 @@ class _DocumentConversionInput(BaseModel):
             if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
-            if (
-                mime is not None
-                and mime.lower() == "application/zip"
-                and obj.suffixes[-1].lower() == ".xlsx"
-            ):
-                mime = (
-                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-                )
+            if mime is not None and mime.lower() == "application/zip":
+                if obj.suffixes[-1].lower() == ".xlsx":
+                    mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                elif obj.suffixes[-1].lower() == ".docx":
+                    mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                elif obj.suffixes[-1].lower() == ".pptx":
+                    mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
         elif isinstance(obj, DocumentStream):
             content = obj.stream.read(8192)