Added and fixed origin for msword and mspowerpoint backends

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-10-16 12:22:19 +02:00
parent d5f161d0f5
commit a07a187150
2 changed files with 23 additions and 2 deletions

View File

@ -91,8 +91,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
mimetype="application/vnd.ms-powerpoint", mimetype="application/vnd.ms-powerpoint",
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument( doc = DoclingDocument(
description=DescriptionItem(), name="name_without_extension", origin=origin description=DescriptionItem(), name=docname, origin=origin
) # must add origin information ) # must add origin information
doc = self.walk_linear(self.pptx_obj, doc) doc = self.walk_linear(self.pptx_obj, doc)

View File

@ -8,6 +8,7 @@ from docling_core.types.doc import (
DescriptionItem, DescriptionItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin,
GroupLabel, GroupLabel,
TableCell, TableCell,
TableData, TableData,
@ -84,7 +85,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model. # Parses the DOCX into a structured document model.
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(
description=DescriptionItem(), name=docname, origin=origin
)
if self.is_valid(): if self.is_valid():
assert self.docx_obj is not None assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)