mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
[WIP] introducing extra backend abstraction and input formats
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
850a521195
commit
95c539579d
@ -10,6 +10,24 @@ if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import Cell
|
||||
|
||||
|
||||
class AbstractDocumentBackend(ABC):
    """Abstract base for document backends built on a path or in-memory stream.

    Every method is declared abstract, so a concrete backend must override
    ``__init__``, ``is_valid`` and ``unload``. ``__init__`` and ``unload``
    still carry shared bodies that an override can reuse through ``super()``.
    """

    @abstractmethod
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        """Store the document source and its hash on the instance.

        Args:
            path_or_stream: Filesystem path or ``BytesIO`` holding the document.
            document_hash: Hash string identifying the document.
        """
        self.path_or_stream = path_or_stream
        self.document_hash = document_hash

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether the backend considers the document usable."""
        pass

    @abstractmethod
    def unload(self) -> None:
        """Release the document source held by this backend.

        A ``BytesIO`` source is closed; a ``Path`` source has nothing to
        close. In both cases the reference is cleared afterwards.
        """
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None
|
||||
|
||||
|
||||
class PdfPageBackend(ABC):
|
||||
|
||||
@abstractmethod
|
||||
@ -43,12 +61,7 @@ class PdfPageBackend(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class PdfDocumentBackend(ABC):
|
||||
@abstractmethod
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
self.path_or_stream = path_or_stream
|
||||
self.document_hash = document_hash
|
||||
|
||||
class PdfDocumentBackend(AbstractDocumentBackend):
|
||||
@abstractmethod
|
||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||
pass
|
||||
@ -56,14 +69,3 @@ class PdfDocumentBackend(ABC):
|
||||
@abstractmethod
|
||||
def page_count(self) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def is_valid(self) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def unload(self):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.path_or_stream.close()
|
||||
|
||||
self.path_or_stream = None
|
||||
|
@ -22,6 +22,14 @@ class ConversionStatus(str, Enum):
|
||||
PARTIAL_SUCCESS = auto()
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
    """Input document formats named by this module.

    Members are also ``str`` instances because of the ``str`` mixin. Their
    values come from ``auto()``, so they depend on declaration order; keep
    the order stable if the values are ever persisted.
    """

    DOCX = auto()
    PPTX = auto()
    HTML = auto()
    IMAGE = auto()
    PDF = auto()
|
||||
|
||||
|
||||
class DocInputType(str, Enum):
|
||||
PATH = auto()
|
||||
STREAM = auto()
|
||||
|
@ -16,7 +16,7 @@ from docling_core.types.experimental.labels import PageLabel
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
FigureElement,
|
||||
InputFormat,
|
||||
Page,
|
||||
PageElement,
|
||||
Table,
|
||||
@ -65,6 +66,13 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
|
||||
description={}, file_info=FileInfo(document_hash="123xyz")
|
||||
) # TODO: Stub
|
||||
|
||||
# Default backend class per input format. ``None`` marks a format that has no
# default backend in this mapping. Every InputFormat member is listed so that
# a lookup never raises KeyError.
_input_format_default_backends: Dict[
    InputFormat, Optional[Type[AbstractDocumentBackend]]
] = {
    InputFormat.PDF: DoclingParseDocumentBackend,
    InputFormat.DOCX: None,
    InputFormat.PPTX: None,
    InputFormat.HTML: None,
    InputFormat.IMAGE: None,
}
|
||||
|
||||
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath = None
|
||||
@ -82,10 +90,13 @@ class InputDocument(BaseModel):
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
filename: Optional[str] = None,
|
||||
limits: Optional[DocumentLimits] = None,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
if not backend:
|
||||
backend = _input_format_default_backends[InputFormat.PDF]
|
||||
|
||||
self.limits = limits or DocumentLimits()
|
||||
|
||||
try:
|
||||
@ -96,7 +107,7 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = pdf_backend(
|
||||
self._backend = backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
|
||||
@ -108,7 +119,7 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = pdf_backend(
|
||||
self._backend = backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
|
||||
@ -435,14 +446,14 @@ class DocumentConversionInput(BaseModel):
|
||||
for obj in self._path_or_stream_iterator:
|
||||
if isinstance(obj, Path):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
|
||||
path_or_stream=obj, limits=self.limits, backend=pdf_backend
|
||||
)
|
||||
elif isinstance(obj, DocumentStream):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj.stream,
|
||||
filename=obj.filename,
|
||||
limits=self.limits,
|
||||
pdf_backend=pdf_backend,
|
||||
backend=pdf_backend,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
|
Loading…
Reference in New Issue
Block a user