diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index c36c6657..a4c64a78 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -43,6 +43,9 @@ class EasyOcrModel(BaseOcrModel): all_ocr_cells = [] for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect ) diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py index 980e60bc..fde8d770 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_model.py @@ -108,6 +108,9 @@ class TesseractOcrModel(BaseOcrModel): all_ocr_cells = [] for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect ) diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py index 07707115..a26c29e3 100644 --- a/docling/models/tesserocr_model.py +++ b/docling/models/tesserocr_model.py @@ -61,6 +61,9 @@ class TesserOcrModel(BaseOcrModel): all_ocr_cells = [] for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect )