From 21f977544c27ab1c8521f471ee4ab000f152add7 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 23 Aug 2024 13:06:20 +0200 Subject: [PATCH] Introduce page-level error checks Signed-off-by: Christoph Auer --- docling/backend/abstract_backend.py | 4 ++++ docling/backend/docling_parse_backend.py | 12 +++++++----- docling/backend/pypdfium2_backend.py | 4 ++++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 91da4ef2..7bb53fce 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -30,6 +30,10 @@ class PdfPageBackend(ABC): def get_size(self) -> "PageSize": pass + @abstractmethod + def is_valid(self) -> bool: + pass + @abstractmethod def unload(self): pass diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index aa5dcced..50c1ff76 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -20,16 +20,18 @@ class DoclingParsePageBackend(PdfPageBackend): self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage ): self._ppage = page_obj - parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) self._dpage = None - self.broken_page = "pages" not in parsed_page - if not self.broken_page: + self.valid = "pages" in parsed_page + if self.valid: self._dpage = parsed_page["pages"][0] + def is_valid(self) -> bool: + return self.valid + def get_text_in_rect(self, bbox: BoundingBox) -> str: - if self.broken_page: + if not self.valid: return "" # Find intersecting cells on the page text_piece = "" @@ -65,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend): cells = [] cell_counter = 0 - if self.broken_page: + if not self.valid: return cells page_size = self.get_size() diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index adf0b63c..6715339f 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -16,6 +16,10 @@ class PyPdfiumPageBackend(PdfPageBackend): def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int): self._ppage: pdfium.PdfPage = pdfium_doc[page_no] self.text_page = None + self.valid = True + + def is_valid(self) -> bool: + return self.valid def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]: AREA_THRESHOLD = 32 * 32