[WIP] introducing extra backend abstraction and input formats

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-09-25 11:17:49 +02:00
parent 850a521195
commit 95c539579d
3 changed files with 44 additions and 23 deletions

View File

@ -10,6 +10,24 @@ if TYPE_CHECKING:
from docling.datamodel.base_models import Cell from docling.datamodel.base_models import Cell
class AbstractDocumentBackend(ABC):
    """Common base for document backends, independent of the input format.

    Keeps the document source (a filesystem path or an in-memory stream)
    together with the document hash, and declares the validity check and
    the cleanup hook that concrete backends have to override.
    """

    @abstractmethod
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        """Record the document source and its hash on the instance."""
        self.document_hash = document_hash
        self.path_or_stream = path_or_stream

    @abstractmethod
    def is_valid(self) -> bool:
        """Return the backend's validity flag; the meaning is set by subclasses."""

    @abstractmethod
    def unload(self):
        """Close the source if it is an in-memory stream, then drop the reference."""
        source = self.path_or_stream
        # Only BytesIO sources are closed here; a Path has nothing to release.
        if isinstance(source, BytesIO):
            source.close()
        self.path_or_stream = None
class PdfPageBackend(ABC): class PdfPageBackend(ABC):
@abstractmethod @abstractmethod
@ -43,12 +61,7 @@ class PdfPageBackend(ABC):
pass pass
class PdfDocumentBackend(ABC): class PdfDocumentBackend(AbstractDocumentBackend):
@abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
self.path_or_stream = path_or_stream
self.document_hash = document_hash
@abstractmethod @abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend: def load_page(self, page_no: int) -> PdfPageBackend:
pass pass
@ -56,14 +69,3 @@ class PdfDocumentBackend(ABC):
@abstractmethod @abstractmethod
def page_count(self) -> int: def page_count(self) -> int:
pass pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None

View File

@ -22,6 +22,14 @@ class ConversionStatus(str, Enum):
PARTIAL_SUCCESS = auto() PARTIAL_SUCCESS = auto()
class InputFormat(str, Enum):
    """Input document formats referenced when selecting a document backend.

    Every member value is produced by ``auto()``, so it is tied to the
    order in which the members are declared below; reordering or inserting
    members changes the generated values.
    """

    DOCX = auto()
    PPTX = auto()
    HTML = auto()
    IMAGE = auto()
    PDF = auto()
class DocInputType(str, Enum): class DocInputType(str, Enum):
PATH = auto() PATH = auto()
STREAM = auto() STREAM = auto()

View File

@ -16,7 +16,7 @@ from docling_core.types.experimental.labels import PageLabel
from pydantic import BaseModel from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated
from docling.backend.abstract_backend import PdfDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
DocumentStream, DocumentStream,
ErrorItem, ErrorItem,
FigureElement, FigureElement,
InputFormat,
Page, Page,
PageElement, PageElement,
Table, Table,
@ -65,6 +66,13 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
description={}, file_info=FileInfo(document_hash="123xyz") description={}, file_info=FileInfo(document_hash="123xyz")
) # TODO: Stub ) # TODO: Stub
# Default backend class for each input format.
# A value of None marks a format that has no default backend assigned yet, so
# callers must check for None before instantiating. Every InputFormat member
# is listed so that a lookup never raises KeyError.
_input_format_default_backends: Dict[
    InputFormat, Optional[Type[AbstractDocumentBackend]]
] = {
    InputFormat.PDF: DoclingParseDocumentBackend,
    InputFormat.DOCX: None,
    InputFormat.PPTX: None,
    InputFormat.HTML: None,
    InputFormat.IMAGE: None,
}
class InputDocument(BaseModel): class InputDocument(BaseModel):
file: PurePath = None file: PurePath = None
@ -82,10 +90,13 @@ class InputDocument(BaseModel):
path_or_stream: Union[BytesIO, Path], path_or_stream: Union[BytesIO, Path],
filename: Optional[str] = None, filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None, limits: Optional[DocumentLimits] = None,
pdf_backend=DoclingParseDocumentBackend, backend: Optional[Type[AbstractDocumentBackend]] = None,
): ):
super().__init__() super().__init__()
if not backend:
backend = _input_format_default_backends[InputFormat.PDF]
self.limits = limits or DocumentLimits() self.limits = limits or DocumentLimits()
try: try:
@ -96,7 +107,7 @@ class InputDocument(BaseModel):
self.valid = False self.valid = False
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend( self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash path_or_stream=path_or_stream, document_hash=self.document_hash
) )
@ -108,7 +119,7 @@ class InputDocument(BaseModel):
self.valid = False self.valid = False
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend( self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash path_or_stream=path_or_stream, document_hash=self.document_hash
) )
@ -435,14 +446,14 @@ class DocumentConversionInput(BaseModel):
for obj in self._path_or_stream_iterator: for obj in self._path_or_stream_iterator:
if isinstance(obj, Path): if isinstance(obj, Path):
yield InputDocument( yield InputDocument(
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend path_or_stream=obj, limits=self.limits, backend=pdf_backend
) )
elif isinstance(obj, DocumentStream): elif isinstance(obj, DocumentStream):
yield InputDocument( yield InputDocument(
path_or_stream=obj.stream, path_or_stream=obj.stream,
filename=obj.filename, filename=obj.filename,
limits=self.limits, limits=self.limits,
pdf_backend=pdf_backend, backend=pdf_backend,
) )
@classmethod @classmethod