mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Introduce page-level error checks
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
cae20ac099
commit
21f977544c
@ -30,6 +30,10 @@ class PdfPageBackend(ABC):
|
||||
def get_size(self) -> "PageSize":
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def is_valid(self) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def unload(self):
|
||||
pass
|
||||
|
@ -20,16 +20,18 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
|
||||
):
|
||||
self._ppage = page_obj
|
||||
|
||||
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
||||
|
||||
self._dpage = None
|
||||
self.broken_page = "pages" not in parsed_page
|
||||
if not self.broken_page:
|
||||
self.valid = "pages" in parsed_page
|
||||
if self.valid:
|
||||
self._dpage = parsed_page["pages"][0]
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
if self.broken_page:
|
||||
if not self.valid:
|
||||
return ""
|
||||
# Find intersecting cells on the page
|
||||
text_piece = ""
|
||||
@ -65,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
cells = []
|
||||
cell_counter = 0
|
||||
|
||||
if self.broken_page:
|
||||
if not self.valid:
|
||||
return cells
|
||||
|
||||
page_size = self.get_size()
|
||||
|
@ -16,6 +16,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int):
|
||||
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
||||
self.text_page = None
|
||||
self.valid = True
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 32 * 32
|
||||
|
Loading…
Reference in New Issue
Block a user