From 95c539579d33c1252c7a6a2f36f5cc37f038e0d7 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 25 Sep 2024 11:17:49 +0200 Subject: [PATCH] [WIP] introducing extra backend abstraction and input formats Signed-off-by: Christoph Auer --- docling/backend/abstract_backend.py | 36 +++++++++++++++-------------- docling/datamodel/base_models.py | 8 +++++++ docling/datamodel/document.py | 23 +++++++++++++----- 3 files changed, 44 insertions(+), 23 deletions(-) diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 22fdc1b2..646afe3c 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -10,6 +10,24 @@ if TYPE_CHECKING: from docling.datamodel.base_models import Cell +class AbstractDocumentBackend(ABC): + @abstractmethod + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + self.path_or_stream = path_or_stream + self.document_hash = document_hash + + @abstractmethod + def is_valid(self) -> bool: + pass + + @abstractmethod + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + + self.path_or_stream = None + + class PdfPageBackend(ABC): @abstractmethod @@ -43,12 +61,7 @@ class PdfPageBackend(ABC): pass -class PdfDocumentBackend(ABC): - @abstractmethod - def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): - self.path_or_stream = path_or_stream - self.document_hash = document_hash - +class PdfDocumentBackend(AbstractDocumentBackend): @abstractmethod def load_page(self, page_no: int) -> PdfPageBackend: pass @@ -56,14 +69,3 @@ class PdfDocumentBackend(ABC): @abstractmethod def page_count(self) -> int: pass - - @abstractmethod - def is_valid(self) -> bool: - pass - - @abstractmethod - def unload(self): - if isinstance(self.path_or_stream, BytesIO): - self.path_or_stream.close() - - self.path_or_stream = None diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 
90d9de98..ee289d39 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -22,6 +22,14 @@ class ConversionStatus(str, Enum): PARTIAL_SUCCESS = auto() +class InputFormat(str, Enum): + DOCX = auto() + PPTX = auto() + HTML = auto() + IMAGE = auto() + PDF = auto() + + class DocInputType(str, Enum): PATH = auto() STREAM = auto() diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 735d23e9..ace1862a 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -16,7 +16,7 @@ from docling_core.types.experimental.labels import PageLabel from pydantic import BaseModel from typing_extensions import deprecated -from docling.backend.abstract_backend import PdfDocumentBackend +from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import ( AssembledUnit, @@ -24,6 +24,7 @@ from docling.datamodel.base_models import ( DocumentStream, ErrorItem, FigureElement, + InputFormat, Page, PageElement, Table, @@ -65,6 +66,13 @@ _EMPTY_DOCLING_DOC = DoclingDocument( description={}, file_info=FileInfo(document_hash="123xyz") ) # TODO: Stub +_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = { + InputFormat.PDF: DoclingParseDocumentBackend, + InputFormat.DOCX: None, + InputFormat.PPTX: None, + InputFormat.IMAGE: None, +} + class InputDocument(BaseModel): file: PurePath = None @@ -82,10 +90,13 @@ class InputDocument(BaseModel): path_or_stream: Union[BytesIO, Path], filename: Optional[str] = None, limits: Optional[DocumentLimits] = None, - pdf_backend=DoclingParseDocumentBackend, + backend: Optional[Type[AbstractDocumentBackend]] = None, ): super().__init__() + if not backend: + backend = _input_format_default_backends[InputFormat.PDF] + self.limits = limits or DocumentLimits() try: @@ -96,7 +107,7 @@ class InputDocument(BaseModel): 
self.valid = False else: self.document_hash = create_file_hash(path_or_stream) - self._backend = pdf_backend( + self._backend = backend( path_or_stream=path_or_stream, document_hash=self.document_hash ) @@ -108,7 +119,7 @@ class InputDocument(BaseModel): self.valid = False else: self.document_hash = create_file_hash(path_or_stream) - self._backend = pdf_backend( + self._backend = backend( path_or_stream=path_or_stream, document_hash=self.document_hash ) @@ -435,14 +446,14 @@ class DocumentConversionInput(BaseModel): for obj in self._path_or_stream_iterator: if isinstance(obj, Path): yield InputDocument( - path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend + path_or_stream=obj, limits=self.limits, backend=pdf_backend ) elif isinstance(obj, DocumentStream): yield InputDocument( path_or_stream=obj.stream, filename=obj.filename, limits=self.limits, - pdf_backend=pdf_backend, + backend=pdf_backend, ) @classmethod