From a07a18715041a26a4602bbcee6d853c526e4ddbe Mon Sep 17 00:00:00 2001 From: Maxim Lysak Date: Wed, 16 Oct 2024 12:22:19 +0200 Subject: [PATCH] Added and fixed origin for msword and mspowerpoint backend Signed-off-by: Maxim Lysak --- docling/backend/mspowerpoint_backend.py | 6 +++++- docling/backend/msword_backend.py | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index fba3e31d..91f1d11f 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -91,8 +91,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB mimetype="application/vnd.ms-powerpoint", binary_hash=self.document_hash, ) + if len(fname) > 0: + docname = Path(fname).stem + else: + docname = "stream" doc = DoclingDocument( - description=DescriptionItem(), name="name_without_extension", origin=origin + description=DescriptionItem(), name=docname, origin=origin ) # must add origin information doc = self.walk_linear(self.pptx_obj, doc) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 182e31a9..0de32682 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -8,6 +8,7 @@ from docling_core.types.doc import ( DescriptionItem, DocItemLabel, DoclingDocument, + DocumentOrigin, GroupLabel, TableCell, TableData, @@ -84,7 +85,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: # Parses the DOCX into a structured document model. - doc = DoclingDocument(description=DescriptionItem(), name="dummy") + + fname = "" + if isinstance(self.path_or_stream, Path): + fname = self.path_or_stream.name + + origin = DocumentOrigin( + filename=fname, + mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + binary_hash=self.document_hash, + ) + if len(fname) > 0: + docname = Path(fname).stem + else: + docname = "stream" + doc = DoclingDocument( + description=DescriptionItem(), name=docname, origin=origin + ) if self.is_valid(): assert self.docx_obj is not None doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)