feat: update parser with bytesio interface and set as new default backend (#32)

* update parser with bytesio interface

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* change default backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update DEFAULT_BACKEND

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2024-08-14 12:30:00 +02:00
committed by GitHub
parent 61be78a875
commit 90dd676422
4 changed files with 86 additions and 32 deletions

View File

@@ -14,7 +14,7 @@ from docling_core.types import TableCell
from pydantic import BaseModel
from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
@@ -64,7 +64,7 @@ class InputDocument(BaseModel):
path_or_stream: Union[BytesIO, Path],
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
-pdf_backend=PyPdfiumDocumentBackend,
+pdf_backend=DoclingParseDocumentBackend,
):
super().__init__()
@@ -308,7 +308,7 @@ class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
limits: Optional[DocumentLimits] = DocumentLimits()
-DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
+DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
def docs(
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None