mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
[WIP] introducing extra backend abstraction and input formats
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
850a521195
commit
95c539579d
@ -10,6 +10,24 @@ if TYPE_CHECKING:
|
|||||||
from docling.datamodel.base_models import Cell
|
from docling.datamodel.base_models import Cell
|
||||||
|
|
||||||
|
|
||||||
|
class AbstractDocumentBackend(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||||
|
self.path_or_stream = path_or_stream
|
||||||
|
self.document_hash = document_hash
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def unload(self):
|
||||||
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
|
self.path_or_stream.close()
|
||||||
|
|
||||||
|
self.path_or_stream = None
|
||||||
|
|
||||||
|
|
||||||
class PdfPageBackend(ABC):
|
class PdfPageBackend(ABC):
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@ -43,12 +61,7 @@ class PdfPageBackend(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PdfDocumentBackend(ABC):
|
class PdfDocumentBackend(AbstractDocumentBackend):
|
||||||
@abstractmethod
|
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
|
||||||
self.path_or_stream = path_or_stream
|
|
||||||
self.document_hash = document_hash
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||||
pass
|
pass
|
||||||
@ -56,14 +69,3 @@ class PdfDocumentBackend(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def page_count(self) -> int:
|
def page_count(self) -> int:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def is_valid(self) -> bool:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def unload(self):
|
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
|
||||||
self.path_or_stream.close()
|
|
||||||
|
|
||||||
self.path_or_stream = None
|
|
||||||
|
@ -22,6 +22,14 @@ class ConversionStatus(str, Enum):
|
|||||||
PARTIAL_SUCCESS = auto()
|
PARTIAL_SUCCESS = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class InputFormat(str, Enum):
|
||||||
|
DOCX = auto()
|
||||||
|
PPTX = auto()
|
||||||
|
HTML = auto()
|
||||||
|
IMAGE = auto()
|
||||||
|
PDF = auto()
|
||||||
|
|
||||||
|
|
||||||
class DocInputType(str, Enum):
|
class DocInputType(str, Enum):
|
||||||
PATH = auto()
|
PATH = auto()
|
||||||
STREAM = auto()
|
STREAM = auto()
|
||||||
|
@ -16,7 +16,7 @@ from docling_core.types.experimental.labels import PageLabel
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
AssembledUnit,
|
AssembledUnit,
|
||||||
@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
|
|||||||
DocumentStream,
|
DocumentStream,
|
||||||
ErrorItem,
|
ErrorItem,
|
||||||
FigureElement,
|
FigureElement,
|
||||||
|
InputFormat,
|
||||||
Page,
|
Page,
|
||||||
PageElement,
|
PageElement,
|
||||||
Table,
|
Table,
|
||||||
@ -65,6 +66,13 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
|
|||||||
description={}, file_info=FileInfo(document_hash="123xyz")
|
description={}, file_info=FileInfo(document_hash="123xyz")
|
||||||
) # TODO: Stub
|
) # TODO: Stub
|
||||||
|
|
||||||
|
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||||
|
InputFormat.PDF: DoclingParseDocumentBackend,
|
||||||
|
InputFormat.DOCX: None,
|
||||||
|
InputFormat.PPTX: None,
|
||||||
|
InputFormat.IMAGE: None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class InputDocument(BaseModel):
|
class InputDocument(BaseModel):
|
||||||
file: PurePath = None
|
file: PurePath = None
|
||||||
@ -82,10 +90,13 @@ class InputDocument(BaseModel):
|
|||||||
path_or_stream: Union[BytesIO, Path],
|
path_or_stream: Union[BytesIO, Path],
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
limits: Optional[DocumentLimits] = None,
|
limits: Optional[DocumentLimits] = None,
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
|
if not backend:
|
||||||
|
backend = _input_format_default_backends[InputFormat.PDF]
|
||||||
|
|
||||||
self.limits = limits or DocumentLimits()
|
self.limits = limits or DocumentLimits()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -96,7 +107,7 @@ class InputDocument(BaseModel):
|
|||||||
self.valid = False
|
self.valid = False
|
||||||
else:
|
else:
|
||||||
self.document_hash = create_file_hash(path_or_stream)
|
self.document_hash = create_file_hash(path_or_stream)
|
||||||
self._backend = pdf_backend(
|
self._backend = backend(
|
||||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -108,7 +119,7 @@ class InputDocument(BaseModel):
|
|||||||
self.valid = False
|
self.valid = False
|
||||||
else:
|
else:
|
||||||
self.document_hash = create_file_hash(path_or_stream)
|
self.document_hash = create_file_hash(path_or_stream)
|
||||||
self._backend = pdf_backend(
|
self._backend = backend(
|
||||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -435,14 +446,14 @@ class DocumentConversionInput(BaseModel):
|
|||||||
for obj in self._path_or_stream_iterator:
|
for obj in self._path_or_stream_iterator:
|
||||||
if isinstance(obj, Path):
|
if isinstance(obj, Path):
|
||||||
yield InputDocument(
|
yield InputDocument(
|
||||||
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
|
path_or_stream=obj, limits=self.limits, backend=pdf_backend
|
||||||
)
|
)
|
||||||
elif isinstance(obj, DocumentStream):
|
elif isinstance(obj, DocumentStream):
|
||||||
yield InputDocument(
|
yield InputDocument(
|
||||||
path_or_stream=obj.stream,
|
path_or_stream=obj.stream,
|
||||||
filename=obj.filename,
|
filename=obj.filename,
|
||||||
limits=self.limits,
|
limits=self.limits,
|
||||||
pdf_backend=pdf_backend,
|
backend=pdf_backend,
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
Loading…
Reference in New Issue
Block a user