mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-09 13:18:24 +00:00
fix: Upgrade docling-parse to 1.1.1, safety checks for failed parse on pages (#45)
* Put safety-checks for failed parse of pages

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Bump to docling-parse 1.1.1

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -23,9 +23,15 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
self._ppage = page_obj
|
||||
|
||||
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
||||
self._dpage = parsed_page["pages"][0]
|
||||
|
||||
self._dpage = None
|
||||
self.broken_page = "pages" not in parsed_page
|
||||
if not self.broken_page:
|
||||
self._dpage = parsed_page["pages"][0]
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
if self.broken_page:
|
||||
return ""
|
||||
# Find intersecting cells on the page
|
||||
text_piece = ""
|
||||
page_size = self.get_size()
|
||||
@@ -60,6 +66,9 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
cells = []
|
||||
cell_counter = 0
|
||||
|
||||
if self.broken_page:
|
||||
return cells
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
parser_width = self._dpage["width"]
|
||||
|
||||
Reference in New Issue
Block a user