diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py
index bfc68f9b..d1b7766d 100644
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -26,9 +26,9 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         self._ppage = page_obj
         parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
 
-        self.valid = "pages" in parsed_page
+        self.valid = "pages" in parsed_page and len(parsed_page["pages"])==1
         if self.valid:
-            self._dpage = parsed_page["pages"][page_no]
+            self._dpage = parsed_page["pages"][0]
         else:
             _log.info(
                 f"An error occured when loading page {page_no} of document {document_hash}."
@@ -223,8 +223,16 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
             )
 
     def page_count(self) -> int:
-        return len(self._pdoc) # To be replaced with docling-parse API
+        #return len(self._pdoc) # To be replaced with docling-parse API
+
+        len_1 = len(self._pdoc)
+        len_2 = self.parser.number_of_pages(self.document_hash)
+
+        if len_1!=len_2:
+            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
+        return len_2
+
 
     def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
         return DoclingParseV2PageBackend(
             self.parser, self.document_hash, page_no, self._pdoc[page_no]
diff --git a/pyproject.toml b/pyproject.toml
index db208af9..13ac6f66 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,7 @@ huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = "^1.7"
 tesserocr = { version = "^2.7.1", optional = true }
-docling-parse = "^1.6.0"
+docling-parse = "^2.0.0"
 certifi = ">=2024.7.4"
 rtree = "^1.3.0"
 scipy = "^1.14.1"