[WIP] introducing extra backend abstraction and input formats

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-09-25 11:17:49 +02:00
parent 850a521195
commit 95c539579d
3 changed files with 44 additions and 23 deletions

View File

@ -10,6 +10,24 @@ if TYPE_CHECKING:
from docling.datamodel.base_models import Cell
class AbstractDocumentBackend(ABC):
    """Abstract base for backends that wrap a single input document.

    An instance keeps the document source (a filesystem path or an
    in-memory stream) together with the hash string passed by the caller.
    """

    @abstractmethod
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        """Remember the document source and its hash on the instance.

        Args:
            path_or_stream: Path to the document, or a ``BytesIO`` holding it.
            document_hash: Hash string supplied by the caller for the document.
        """
        self.document_hash = document_hash
        self.path_or_stream = path_or_stream

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether the backend is valid; subclasses define the check."""

    @abstractmethod
    def unload(self):
        """Release the document source.

        A ``BytesIO`` source is closed first; in every case the stored
        reference is then reset to ``None``.
        """
        source = self.path_or_stream
        if isinstance(source, BytesIO):
            source.close()
        self.path_or_stream = None
class PdfPageBackend(ABC):
@abstractmethod
@ -43,12 +61,7 @@ class PdfPageBackend(ABC):
pass
class PdfDocumentBackend(ABC):
@abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
self.path_or_stream = path_or_stream
self.document_hash = document_hash
class PdfDocumentBackend(AbstractDocumentBackend):
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@ -56,14 +69,3 @@ class PdfDocumentBackend(ABC):
@abstractmethod
def page_count(self) -> int:
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None

View File

@ -22,6 +22,14 @@ class ConversionStatus(str, Enum):
PARTIAL_SUCCESS = auto()
class InputFormat(str, Enum):
    """Enumerates the supported input document formats.

    Values are explicit lowercase strings rather than ``auto()``: with a
    ``str`` mixin, ``auto()`` produces opaque numeric strings such as "1",
    which are unreadable when the member is serialized or logged and depend
    on declaration order. Member names are unchanged.
    """

    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
    IMAGE = "image"
    PDF = "pdf"
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()

View File

@ -16,7 +16,7 @@ from docling_core.types.experimental.labels import PageLabel
from pydantic import BaseModel
from typing_extensions import deprecated
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
DocumentStream,
ErrorItem,
FigureElement,
InputFormat,
Page,
PageElement,
Table,
@ -65,6 +66,13 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
description={}, file_info=FileInfo(document_hash="123xyz")
) # TODO: Stub
# Default backend class for each input format. ``None`` marks a format that
# has no default backend assigned in this table; every InputFormat member is
# listed so that a lookup never raises KeyError.
_input_format_default_backends: Dict[
    InputFormat, Optional[Type[AbstractDocumentBackend]]
] = {
    InputFormat.PDF: DoclingParseDocumentBackend,
    InputFormat.DOCX: None,
    InputFormat.PPTX: None,
    InputFormat.HTML: None,
    InputFormat.IMAGE: None,
}
class InputDocument(BaseModel):
file: PurePath = None
@ -82,10 +90,13 @@ class InputDocument(BaseModel):
path_or_stream: Union[BytesIO, Path],
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
pdf_backend=DoclingParseDocumentBackend,
backend: Optional[Type[AbstractDocumentBackend]] = None,
):
super().__init__()
if not backend:
backend = _input_format_default_backends[InputFormat.PDF]
self.limits = limits or DocumentLimits()
try:
@ -96,7 +107,7 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
@ -108,7 +119,7 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
@ -435,14 +446,14 @@ class DocumentConversionInput(BaseModel):
for obj in self._path_or_stream_iterator:
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
path_or_stream=obj, limits=self.limits, backend=pdf_backend
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
filename=obj.filename,
limits=self.limits,
pdf_backend=pdf_backend,
backend=pdf_backend,
)
@classmethod