From 6faff146e0c3aaa04b8b6cae7d8945bb939f92d6 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Mon, 7 Oct 2024 17:01:47 +0200 Subject: [PATCH] fix(OCR): Skip zero area OCR cells for all OCR engines Signed-off-by: Nikos Livathinos --- docling/models/easyocr_model.py | 3 +++ docling/models/tesseract_model.py | 3 +++ docling/models/tesserocr_model.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index c36c6657..a4c64a78 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -43,6 +43,9 @@ class EasyOcrModel(BaseOcrModel): all_ocr_cells = [] for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect ) diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py index 980e60bc..fde8d770 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_model.py @@ -108,6 +108,9 @@ class TesseractOcrModel(BaseOcrModel): all_ocr_cells = [] for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect ) diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py index 07707115..a26c29e3 100644 --- a/docling/models/tesserocr_model.py +++ b/docling/models/tesserocr_model.py @@ -61,6 +61,9 @@ class TesserOcrModel(BaseOcrModel): all_ocr_cells = [] for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect )