From ca9f940c8f1a69b3439fa2e0fa10d31d29b530b9 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <dol@zurich.ibm.com>
Date: Fri, 18 Oct 2024 10:11:58 +0200
Subject: [PATCH] feat: add coverage_threshold to skip OCR for small images

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docling/datamodel/pipeline_options.py |  3 +++
 docling/models/base_ocr_model.py      |  4 ++++
 tests/test_options.py                 | 20 ++++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 637b0c0e..6bc36eb7 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -22,6 +22,9 @@ class TableStructureOptions(BaseModel):
 
 class OcrOptions(BaseModel):
     kind: str
+    coverage_threshold: float = (
+        0.3  # fraction (0-1) of the page area that bitmaps must cover for OCR to be triggered
+    )
 
 
 class EasyOcrOptions(OcrOptions):
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index 59ae2295..2b40320f 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -68,6 +68,10 @@ class BaseOcrModel:
             bitmap_rects = []
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
 
+        # skip OCR if the bitmap area on the page is smaller than the options threshold
+        if coverage < self.options.coverage_threshold:
+            return []
+
         # return full-page rectangle if sufficiently covered with bitmaps
         if coverage > BITMAP_COVERAGE_TRESHOLD:
             return [
diff --git a/tests/test_options.py b/tests/test_options.py
index ad6c7a45..c67409ea 100644
--- a/tests/test_options.py
+++ b/tests/test_options.py
@@ -42,3 +42,23 @@ def test_e2e_conversions(test_doc_path):
         doc_result: ConversionResult = converter.convert(test_doc_path)
 
         assert doc_result.status == ConversionStatus.SUCCESS
+
+
+def test_ocr_coverage_threshold():
+    """Check that an unreachably high coverage_threshold suppresses OCR output."""
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_ocr = True
+    # 1.1 is presumably above any achievable coverage, so OCR is always skipped
+    pipeline_options.ocr_options.coverage_threshold = 1.1
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+        }
+    )
+
+    scanned_doc_path = Path("./tests/data_scanned/ocr_test.pdf")
+    doc_result: ConversionResult = converter.convert(scanned_doc_path)
+
+    # no text items are expected: the threshold above keeps OCR from running
+    assert len(doc_result.document.texts) == 0