Introduce page-level error checks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-23 13:06:20 +02:00
parent cae20ac099
commit 21f977544c
3 changed files with 15 additions and 5 deletions

View File

@@ -30,6 +30,10 @@ class PdfPageBackend(ABC):
def get_size(self) -> "PageSize": def get_size(self) -> "PageSize":
pass pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod @abstractmethod
def unload(self): def unload(self):
pass pass

View File

@@ -20,16 +20,18 @@ class DoclingParsePageBackend(PdfPageBackend):
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
): ):
self._ppage = page_obj self._ppage = page_obj
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
self._dpage = None self._dpage = None
self.broken_page = "pages" not in parsed_page self.valid = "pages" in parsed_page
if not self.broken_page: if self.valid:
self._dpage = parsed_page["pages"][0] self._dpage = parsed_page["pages"][0]
def is_valid(self) -> bool:
return self.valid
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
if self.broken_page: if not self.valid:
return "" return ""
# Find intersecting cells on the page # Find intersecting cells on the page
text_piece = "" text_piece = ""
@@ -65,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend):
cells = [] cells = []
cell_counter = 0 cell_counter = 0
if self.broken_page: if not self.valid:
return cells return cells
page_size = self.get_size() page_size = self.get_size()

View File

@@ -16,6 +16,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int): def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int):
self._ppage: pdfium.PdfPage = pdfium_doc[page_no] self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
self.text_page = None self.text_page = None
self.valid = True
def is_valid(self) -> bool:
return self.valid
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32 AREA_THRESHOLD = 32 * 32