mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
fix(OCR): Skip zero area OCR cells for all OCR engines
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
a9b22a8694
commit
6faff146e0
@@ -43,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
all_ocr_cells = []
|
all_ocr_cells = []
|
||||||
for ocr_rect in ocr_rects:
|
for ocr_rect in ocr_rects:
|
||||||
|
# Skip zero area boxes
|
||||||
|
if ocr_rect.area() == 0:
|
||||||
|
continue
|
||||||
high_res_image = page._backend.get_page_image(
|
high_res_image = page._backend.get_page_image(
|
||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
)
|
||||||
|
@@ -108,6 +108,9 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
all_ocr_cells = []
|
all_ocr_cells = []
|
||||||
for ocr_rect in ocr_rects:
|
for ocr_rect in ocr_rects:
|
||||||
|
# Skip zero area boxes
|
||||||
|
if ocr_rect.area() == 0:
|
||||||
|
continue
|
||||||
high_res_image = page._backend.get_page_image(
|
high_res_image = page._backend.get_page_image(
|
||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
)
|
||||||
|
@@ -61,6 +61,9 @@ class TesserOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
all_ocr_cells = []
|
all_ocr_cells = []
|
||||||
for ocr_rect in ocr_rects:
|
for ocr_rect in ocr_rects:
|
||||||
|
# Skip zero area boxes
|
||||||
|
if ocr_rect.area() == 0:
|
||||||
|
continue
|
||||||
high_res_image = page._backend.get_page_image(
|
high_res_image = page._backend.get_page_image(
|
||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user