Add type hints and fix mypy errors

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-25 19:44:34 +00:00 · 2025-06-13 11:08:55 +02:00 · 2025-06-13 11:08:55 +02:00 · e3cefd0e71
commit e3cefd0e71
parent 9469280802
1 changed file with 8 additions and 3 deletions
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -7,6 +7,7 @@ from typing import List, Optional, Type

 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
            return []

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(
+        self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
+    ) -> List[TextCell]:
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
@@ -130,7 +133,7 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
        ]
        return filtered_ocr_cells

-    def post_process_cells(self, ocr_cells, page):
+    def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
        r"""
        Post-process the OCR cells and update the page object.
        Updates parsed_page.textline_cells directly since page.cells is now read-only.
@@ -147,7 +150,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
        page.parsed_page.textline_cells = final_cells
        page.parsed_page.has_lines = len(final_cells) > 0

-    def _combine_cells(self, existing_cells, ocr_cells):
+    def _combine_cells(
+        self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
+    ) -> List[TextCell]:
        """Combine existing and OCR cells with filtering and re-indexing."""
        if self.options.force_full_page_ocr:
            combined = ocr_cells