feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)

* Use docling-parse page-by-page

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Propagate document_hash to PDF backends, use docling-parse 1.0.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Upgrade lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* repin after more packages on pypi

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-15 16:18:22 +00:00 · 2024-08-22 13:49:37 +02:00
parent f7c50c8b0e
commit a8c6b29a67
8 changed files with 73 additions and 51 deletions
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )

            elif isinstance(path_or_stream, BytesIO):
                self.file = PurePath(filename)
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )

            if self.document_hash and self._backend.page_count() > 0:
                self.page_count = self._backend.page_count()