diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 56f2fa8f..efaa6ff8 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -22,7 +22,7 @@ class TableStructureOptions(BaseModel): class OcrOptions(BaseModel): kind: str - coverage_threshold: float = ( + bitmap_area_threshold: float = ( 0.05 # percentage of the area for a bitmap to processed with OCR ) diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 4b368695..da6860a8 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -69,7 +69,7 @@ class BaseOcrModel: coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects) # return full-page rectangle if sufficiently covered with bitmaps - if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.coverage_threshold): + if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold): return [ BoundingBox( l=0, @@ -87,7 +87,7 @@ class BaseOcrModel: rect for rect in ocr_rects if rect.area() / (page.size.width * page.size.height) - > self.options.coverage_threshold + > self.options.bitmap_area_threshold ] return ocr_rects diff --git a/tests/test_options.py b/tests/test_options.py index c67409ea..c53570cc 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -47,7 +47,7 @@ def test_e2e_conversions(test_doc_path): def test_ocr_coverage_threshold(test_doc_path): pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = True - pipeline_options.ocr_options.coverage_threshold = 1.1 + pipeline_options.ocr_options.bitmap_area_threshold = 1.1 converter = DocumentConverter( format_options={