Fix type errors reported by mypy

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-08-02 07:22:14 +00:00 · 2025-02-06 15:55:46 +01:00 · 2025-02-06 15:55:46 +01:00 · 69e8a9d499
commit 69e8a9d499
parent fce6bb14db
5 changed files with 15 additions and 12 deletions
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@@ -2,6 +2,7 @@ import re
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union

+import numpy as np
 from docling_core.types.doc import (
    CodeItem,
    DocItemLabel,
@@ -101,7 +102,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
                artifacts_path = Path(artifacts_path)

            self.code_formula_model = CodeFormulaPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                device=device,
                num_threads=accelerator_options.num_threads,
            )
@@ -226,7 +227,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            return

        labels: List[str] = []
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[TextItem] = []
        for el in element_batch:
            assert isinstance(el.item, TextItem)
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union

+import numpy as np
 from docling_core.types.doc import (
    DoclingDocument,
    NodeItem,
@@ -93,7 +94,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                artifacts_path = Path(artifacts_path)

            self.document_picture_classifier = DocumentFigureClassifierPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                device=device,
                num_threads=accelerator_options.num_threads,
            )
@@ -159,7 +160,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                yield element
            return

-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el, PictureItem)
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -106,10 +106,12 @@ class LayoutModel(BasePageModel):
            else:
                with TimeRecorder(conv_res, "layout"):
                    assert page.size is not None
+                    page_image = page.get_image(scale=1.0)
+                    assert page_image is not None

                    clusters = []
                    for ix, pred_item in enumerate(
-                        self.layout_predictor.predict(page.get_image(scale=1.0))
+                        self.layout_predictor.predict(page_image)
                    ):
                        label = DocItemLabel(
                            pred_item["label"]
--- a/poetry.lock
+++ b/poetry.lock
@@ -282,18 +282,17 @@ testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-ch

 [[package]]
 name = "beautifulsoup4"
-version = "4.13.3"
+version = "4.12.3"
 description = "Screen-scraping library"
 optional = false
-python-versions = ">=3.7.0"
+python-versions = ">=3.6.0"
 files = [
-    {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
-    {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
+    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
+    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
 ]

 [package.dependencies]
 soupsieve = ">1.2"
-typing-extensions = ">=4.0.0"

 [package.extras]
 cchardet = ["cchardet"]
@@ -7815,4 +7814,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "dcab39f8f6cc4a2b24e25774c147dce5eb9da775309d69a9304b72c47725b021"
+content-hash = "241c144d17dc30e30dc3c40cbbaeffd5e03a70c274b86be5ee04089a737fac1e"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,7 @@ scipy = [
 typer = "^0.12.5"
 python-docx = "^1.1.2"
 python-pptx = "^1.0.2"
-beautifulsoup4 = "^4.12.3"
+beautifulsoup4 = ">=4.12.3,<4.13.0"
 pandas = "^2.1.4"
 marko = "^2.1.2"
 openpyxl = "^3.1.5"