mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
Updates for Powerpoint backend
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
89e58ca730
commit
1d55cbdca9
@ -36,6 +36,18 @@ class AbstractDocumentBackend(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PaginatedDocumentBackend(AbstractDocumentBackend):
|
||||||
|
"""PaginatedDocumentBackend.
|
||||||
|
|
||||||
|
A paginated document backend is a backend that can report the number of
|
||||||
|
pages in its document via page_count().
|
||||||
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def page_count(self) -> int:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
||||||
"""DeclarativeDocumentBackend.
|
"""DeclarativeDocumentBackend.
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ from docling_core.types.experimental import (
|
|||||||
DescriptionItem,
|
DescriptionItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
ImageRef,
|
ImageRef,
|
||||||
PictureItem,
|
PictureItem,
|
||||||
@ -21,13 +22,16 @@ from pptx import Presentation
|
|||||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||||
from pptx.util import Inches
|
from pptx.util import Inches
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import (
|
||||||
from docling.datamodel.base_models import InputFormat
|
DeclarativeDocumentBackend,
|
||||||
|
PaginatedDocumentBackend,
|
||||||
|
)
|
||||||
|
from docling.datamodel.base_models import FormatToMimeType, InputFormat
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
|
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||||
super().__init__(path_or_stream, document_hash)
|
super().__init__(path_or_stream, document_hash)
|
||||||
self.namespaces = {
|
self.namespaces = {
|
||||||
@ -37,13 +41,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
}
|
}
|
||||||
# Powerpoint file:
|
# Powerpoint file:
|
||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
|
|
||||||
|
self.pptx_obj = None
|
||||||
|
self.valid = True
|
||||||
|
try:
|
||||||
|
self.pptx_obj = Presentation(self.path_or_stream)
|
||||||
|
except Exception:
|
||||||
|
_log.error("could not parse pptx")
|
||||||
|
self.valid = False
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def page_count(self) -> int:
|
||||||
|
if self.is_valid():
|
||||||
|
return len(self.pptx_obj.slides)
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return True
|
return self.valid
|
||||||
|
|
||||||
def is_paginated(cls) -> bool:
|
def is_paginated(cls) -> bool:
|
||||||
return False # True? if so, how to handle pages...
|
return True # True? if so, how to handle pages...
|
||||||
|
|
||||||
def unload(self):
|
def unload(self):
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
@ -57,14 +76,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# Parses the PPTX into a structured document model.
|
# Parses the PPTX into a structured document model.
|
||||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
||||||
pptx_obj = None
|
|
||||||
try:
|
origin = DocumentOrigin(
|
||||||
pptx_obj = Presentation(self.path_or_stream)
|
filename=self.path_or_stream.name,
|
||||||
except Exception:
|
mimetype="application/vnd.ms-powerpoint",
|
||||||
_log.error("could not parse pptx")
|
binary_hash=self.document_hash,
|
||||||
return doc
|
)
|
||||||
doc = self.walk_linear(pptx_obj, doc)
|
doc = DoclingDocument(
|
||||||
|
description=DescriptionItem(), name="name_without_extension", origin=origin
|
||||||
|
) # TODO must add origin information
|
||||||
|
doc = self.walk_linear(self.pptx_obj, doc)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user