# Contents of docling/backend/msexcel_backend.py as created by the patch
# "feat: added excel backend" (commit 0435bfe4, Peter Staar, 2024-11-14).
"""Skeleton declarative backend for Microsoft Excel inputs."""

import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union

# NOTE(review): DoclingDocument / DocumentOrigin were referenced but never
# imported in the original patch, which makes the module fail at import time
# (the return annotation of ``convert`` is evaluated when the class body runs).
# They are assumed to be exported by ``docling_core.types.doc`` -- confirm.
from docling_core.types.doc import DoclingDocument, DocumentOrigin
from lxml import etree

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)

# MIME type registered for Office Open XML spreadsheets (.xlsx).
_XLSX_MIMETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


class MsExcelDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend stub for Excel workbooks.

    The actual sheet parsing is not implemented yet: ``convert`` only builds
    an empty ``DoclingDocument`` that carries the origin metadata.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Forward the input document and its source to the base backend."""
        super().__init__(in_doc, path_or_stream)

    def is_valid(self) -> bool:
        """Return whether the backend initialised successfully.

        Nothing in this class assigns ``self.valid``; if the base class does
        not set it either, report the backend as invalid instead of raising
        ``AttributeError``.
        """
        return getattr(self, "valid", False)

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report that this backend supports pagination."""
        return True

    def unload(self) -> None:
        """Close an in-memory stream (if any) and drop the source reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.EXCEL}

    def convert(self) -> DoclingDocument:
        """Build the structured document model for the Excel workbook.

        Returns:
            A ``DoclingDocument`` named after the input file stem. It is
            currently empty apart from its origin metadata.

        Raises:
            RuntimeError: if ``is_valid()`` reports that the backend failed
                to initialise.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype=_XLSX_MIMETYPE,
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        # FIXME: do implementation (workbook content is not parsed yet).
        return doc