Updates for Powerpoint backend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-08 13:19:58 +02:00
parent 89e58ca730
commit 1d55cbdca9
2 changed files with 47 additions and 13 deletions

View File

@ -36,6 +36,18 @@ class AbstractDocumentBackend(ABC):
pass pass
class PaginatedDocumentBackend(AbstractDocumentBackend):
    """PaginatedDocumentBackend.

    A paginated document backend is a backend whose document is organized in
    pages and which can report how many pages it contains.
    """

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages in the document."""
        pass
class DeclarativeDocumentBackend(AbstractDocumentBackend): class DeclarativeDocumentBackend(AbstractDocumentBackend):
"""DeclarativeDocumentBackend. """DeclarativeDocumentBackend.

View File

@ -9,6 +9,7 @@ from docling_core.types.experimental import (
DescriptionItem, DescriptionItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin,
GroupLabel, GroupLabel,
ImageRef, ImageRef,
PictureItem, PictureItem,
@ -21,13 +22,16 @@ from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
from pptx.util import Inches from pptx.util import Inches
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import (
from docling.datamodel.base_models import InputFormat DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import FormatToMimeType, InputFormat
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend): class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash) super().__init__(path_or_stream, document_hash)
self.namespaces = { self.namespaces = {
@ -37,13 +41,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
} }
# Powerpoint file: # Powerpoint file:
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.pptx_obj = None
self.valid = True
try:
self.pptx_obj = Presentation(self.path_or_stream)
except Exception:
_log.error("could not parse pptx")
self.valid = False
return return
def page_count(self) -> int:
    """Return the number of slides, or 0 when the backend is not valid."""
    # Guard first: ``pptx_obj`` is only meaningful for a valid backend.
    if not self.is_valid():
        return 0
    return len(self.pptx_obj.slides)
def is_valid(self) -> bool:
    """Report the validity flag stored on this backend in ``self.valid``."""
    valid = self.valid
    return valid
def is_paginated(cls) -> bool:
    """Return True: this backend reports itself as paginated."""
    # NOTE(review): the first parameter is named ``cls`` but no @classmethod
    # decorator is visible here - confirm against the abstract base class.
    return True
def unload(self): def unload(self):
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
@ -57,14 +76,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
    """Parse the PPTX into a structured document model.

    Returns:
        A ``DoclingDocument`` whose origin records the source file name, the
        MIME type and the binary hash of the input. Its content is filled in
        by ``walk_linear`` from the loaded presentation; when no presentation
        is loaded, the document is returned without content.
    """
    # A BytesIO input has no ``name`` attribute, so fall back to a placeholder
    # file name instead of raising AttributeError.
    filename = str(getattr(self.path_or_stream, "name", None) or "file.pptx")

    # NOTE(review): "application/vnd.ms-powerpoint" is the legacy .ppt type;
    # confirm whether DocumentOrigin accepts the OOXML presentation MIME type
    # before switching to it.
    origin = DocumentOrigin(
        filename=filename,
        mimetype="application/vnd.ms-powerpoint",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(
        description=DescriptionItem(),
        name=Path(filename).stem,  # real document name instead of a placeholder
        origin=origin,
    )

    # Nothing to walk when no presentation object is available; return the
    # empty document rather than passing None on to walk_linear.
    if self.pptx_obj is None:
        _log.error("could not parse pptx")
        return doc

    return self.walk_linear(self.pptx_obj, doc)