From ca9f940c8f1a69b3439fa2e0fa10d31d29b530b9 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Fri, 18 Oct 2024 10:11:58 +0200
Subject: [PATCH] feat: add coverage_threshold to skip OCR for small images

Signed-off-by: Michele Dolfi
---
 docling/datamodel/pipeline_options.py |  3 +++
 docling/models/base_ocr_model.py      |  4 ++++
 tests/test_options.py                 | 20 ++++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 637b0c0e..6bc36eb7 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -22,6 +22,9 @@ class TableStructureOptions(BaseModel):
 
 class OcrOptions(BaseModel):
     kind: str
+    coverage_threshold: float = (
+        0.3  # minimum fraction of the page area (0.3 = 30%) that bitmaps must cover for OCR to be triggered
+    )
 
 
 class EasyOcrOptions(OcrOptions):
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index 59ae2295..2b40320f 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -68,6 +68,10 @@ class BaseOcrModel:
             bitmap_rects = []
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
 
+        # skip OCR when the bitmap coverage of the page is below the configured coverage_threshold
+        if coverage < self.options.coverage_threshold:
+            return []
+
         # return full-page rectangle if sufficiently covered with bitmaps
         if coverage > BITMAP_COVERAGE_TRESHOLD:
             return [
diff --git a/tests/test_options.py b/tests/test_options.py
index ad6c7a45..c67409ea 100644
--- a/tests/test_options.py
+++ b/tests/test_options.py
@@ -42,3 +42,23 @@ def test_e2e_conversions(test_doc_path):
         doc_result: ConversionResult = converter.convert(test_doc_path)
 
         assert doc_result.status == ConversionStatus.SUCCESS
+
+
+def test_ocr_coverage_threshold(test_doc_path):
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_ocr = True
+    pipeline_options.ocr_options.coverage_threshold = 1.1
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+
+    test_doc_path = Path("./tests/data_scanned/ocr_test.pdf")
+    doc_result: ConversionResult = converter.convert(test_doc_path)
+
+    # this should have generated no results, since we set a very high threshold
+    assert len(doc_result.document.texts) == 0