mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction
This commit is contained in:
commit
07206c5b3e
@ -91,8 +91,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
mimetype="application/vnd.ms-powerpoint",
|
mimetype="application/vnd.ms-powerpoint",
|
||||||
binary_hash=self.document_hash,
|
binary_hash=self.document_hash,
|
||||||
)
|
)
|
||||||
|
if len(fname) > 0:
|
||||||
|
docname = Path(fname).stem
|
||||||
|
else:
|
||||||
|
docname = "stream"
|
||||||
doc = DoclingDocument(
|
doc = DoclingDocument(
|
||||||
description=DescriptionItem(), name="name_without_extension", origin=origin
|
description=DescriptionItem(), name=docname, origin=origin
|
||||||
) # must add origin information
|
) # must add origin information
|
||||||
doc = self.walk_linear(self.pptx_obj, doc)
|
doc = self.walk_linear(self.pptx_obj, doc)
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ from docling_core.types.doc import (
|
|||||||
DescriptionItem,
|
DescriptionItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
@ -84,7 +85,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# Parses the DOCX into a structured document model.
|
# Parses the DOCX into a structured document model.
|
||||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
|
||||||
|
fname = ""
|
||||||
|
if isinstance(self.path_or_stream, Path):
|
||||||
|
fname = self.path_or_stream.name
|
||||||
|
|
||||||
|
origin = DocumentOrigin(
|
||||||
|
filename=fname,
|
||||||
|
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
binary_hash=self.document_hash,
|
||||||
|
)
|
||||||
|
if len(fname) > 0:
|
||||||
|
docname = Path(fname).stem
|
||||||
|
else:
|
||||||
|
docname = "stream"
|
||||||
|
doc = DoclingDocument(
|
||||||
|
description=DescriptionItem(), name=docname, origin=origin
|
||||||
|
)
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
assert self.docx_obj is not None
|
assert self.docx_obj is not None
|
||||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||||
|
Loading…
Reference in New Issue
Block a user