mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-15 16:18:22 +00:00
feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)
* Use docling-parse page-by-page Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Propagate document_hash to PDF backends, use docling-parse 1.0.0 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Upgrade lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * repin after more packages on pypi Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = pdf_backend(path_or_stream=path_or_stream)
|
||||
self._backend = pdf_backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
self.file = PurePath(filename)
|
||||
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = pdf_backend(path_or_stream=path_or_stream)
|
||||
self._backend = pdf_backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
|
||||
if self.document_hash and self._backend.page_count() > 0:
|
||||
self.page_count = self._backend.page_count()
|
||||
|
||||
Reference in New Issue
Block a user