feat: update parser with bytesio interface and set as new default backend (#32)

* update parser with bytesio interface

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* change default backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update DEFAULT_BACKEND

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2024-08-14 12:30:00 +02:00
committed by GitHub
parent 61be78a875
commit 90dd676422
4 changed files with 86 additions and 32 deletions

View File

@@ -150,10 +150,11 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
super().__init__(path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream)
# Parsing cells with docling_parser call
if isinstance(path_or_stream, BytesIO):
raise NotImplemented("This backend does not support byte streams yet.")
parser = pdf_parser()
self._parser_doc = parser.find_cells(str(path_or_stream))
if isinstance(path_or_stream, BytesIO):
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
else:
self._parser_doc = parser.find_cells(str(path_or_stream))
def page_count(self) -> int:
    """Return the number of pages in the parsed document.

    The count is the length of the ``"pages"`` entry of
    ``self._parser_doc``, the structure stored on the instance from the
    parser's cell-finding call.
    """
    return len(self._parser_doc["pages"])