fix: Upgrade docling-parse to 1.1.1, add safety checks for pages that fail to parse (#45)

* Add safety checks for pages that fail to parse

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Bump to docling-parse 1.1.1

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-08-23 12:51:02 +02:00
committed by GitHub
parent 1930f08d4e
commit 7e84533299
3 changed files with 39 additions and 28 deletions

View File

@@ -23,9 +23,15 @@ class DoclingParsePageBackend(PdfPageBackend):
self._ppage = page_obj
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
self._dpage = parsed_page["pages"][0]
self._dpage = None
self.broken_page = "pages" not in parsed_page
if not self.broken_page:
self._dpage = parsed_page["pages"][0]
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if self.broken_page:
return ""
# Find intersecting cells on the page
text_piece = ""
page_size = self.get_size()
@@ -60,6 +66,9 @@ class DoclingParsePageBackend(PdfPageBackend):
cells = []
cell_counter = 0
if self.broken_page:
return cells
page_size = self.get_size()
parser_width = self._dpage["width"]