mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Introduce page-level error checks
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
cae20ac099
commit
21f977544c
@ -30,6 +30,10 @@ class PdfPageBackend(ABC):
|
|||||||
def get_size(self) -> "PageSize":
|
def get_size(self) -> "PageSize":
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def unload(self):
|
def unload(self):
|
||||||
pass
|
pass
|
||||||
|
@ -20,16 +20,18 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|||||||
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
|
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
|
||||||
):
|
):
|
||||||
self._ppage = page_obj
|
self._ppage = page_obj
|
||||||
|
|
||||||
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
||||||
|
|
||||||
self._dpage = None
|
self._dpage = None
|
||||||
self.broken_page = "pages" not in parsed_page
|
self.valid = "pages" in parsed_page
|
||||||
if not self.broken_page:
|
if self.valid:
|
||||||
self._dpage = parsed_page["pages"][0]
|
self._dpage = parsed_page["pages"][0]
|
||||||
|
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
return self.valid
|
||||||
|
|
||||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||||
if self.broken_page:
|
if not self.valid:
|
||||||
return ""
|
return ""
|
||||||
# Find intersecting cells on the page
|
# Find intersecting cells on the page
|
||||||
text_piece = ""
|
text_piece = ""
|
||||||
@ -65,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|||||||
cells = []
|
cells = []
|
||||||
cell_counter = 0
|
cell_counter = 0
|
||||||
|
|
||||||
if self.broken_page:
|
if not self.valid:
|
||||||
return cells
|
return cells
|
||||||
|
|
||||||
page_size = self.get_size()
|
page_size = self.get_size()
|
||||||
|
@ -16,6 +16,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int):
|
def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int):
|
||||||
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
||||||
self.text_page = None
|
self.text_page = None
|
||||||
|
self.valid = True
|
||||||
|
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
return self.valid
|
||||||
|
|
||||||
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
||||||
AREA_THRESHOLD = 32 * 32
|
AREA_THRESHOLD = 32 * 32
|
||||||
|
Loading…
Reference in New Issue
Block a user