mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction
This commit is contained in:
commit
07206c5b3e
@ -91,8 +91,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
mimetype="application/vnd.ms-powerpoint",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
doc = DoclingDocument(
|
||||
description=DescriptionItem(), name="name_without_extension", origin=origin
|
||||
description=DescriptionItem(), name=docname, origin=origin
|
||||
) # must add origin information
|
||||
doc = self.walk_linear(self.pptx_obj, doc)
|
||||
|
||||
|
@ -8,6 +8,7 @@ from docling_core.types.doc import (
|
||||
DescriptionItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
TableCell,
|
||||
TableData,
|
||||
@ -84,7 +85,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
# Parses the DOCX into a structured document model.
|
||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=fname,
|
||||
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
doc = DoclingDocument(
|
||||
description=DescriptionItem(), name=docname, origin=origin
|
||||
)
|
||||
if self.is_valid():
|
||||
assert self.docx_obj is not None
|
||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
|
Loading…
Reference in New Issue
Block a user