Finalize linter fixes and enable the Ruff pre-commit hook

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2025-04-14 16:16:02 +02:00 · 2025-04-14 16:16:02 +02:00 · 1b5337abf9
commit 1b5337abf9
parent ad28271b4a
9 changed files with 35 additions and 63 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -9,48 +9,18 @@ repos:
        args: [--config=pyproject.toml]
        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
      # Run the Ruff linter.
-      # - id: ruff
-      #   name: "Ruff linter"
-      #   args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
-      #   files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      - id: ruff
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
  - repo: local
    hooks:
-      # - id: black
-      #   name: Black
-      #   entry: poetry run black docling docs/examples tests
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
-      # - id: isort
-      #   name: isort
-      #   entry: poetry run isort docling docs/examples tests
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
-#      - id: flake8
-#        name: flake8
-#        entry: poetry run flake8 docling
-#        pass_filenames: false
-#        language: system
-#        files: '\.py$'
      - id: mypy
        name: MyPy
        entry: poetry run mypy docling
        pass_filenames: false
        language: system
        files: '\.py$'
-      # - id: nbqa_black
-      #   name: nbQA Black
-      #   entry: poetry run nbqa black docs/examples
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.ipynb$'
-      # - id: nbqa_isort
-      #   name: nbQA isort
-      #   entry: poetry run nbqa isort docs/examples
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.ipynb$'
      - id: poetry
        name: Poetry check
        entry: poetry check --lock
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
        # _log.info(decoded_data)

        # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
+        df_result = pd.read_csv(
+            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
+        )

        # Display the dataframe (optional)
        # _log.info("df: %s", df_result.head())

        # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[
-            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        df_filtered = df_result[
+            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
        ]

        return df_filtered
@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
        output, _ = proc.communicate()
        decoded_data = output.decode("utf-8")
-        df = pd.read_csv(
+        df_detected = pd.read_csv(
            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
        )
-        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
        if len(scripts) == 0:
            _log.warning("Tesseract cannot detect the script of the page")
            return None
@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
        output, _ = proc.communicate()
        decoded_data = output.decode("utf-8")
-        df = pd.read_csv(io.StringIO(decoded_data), header=None)
-        self._tesseract_languages = df[0].tolist()[1:]
+        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df_list[0].tolist()[1:]

        # Decide the script prefix
-        if any(l.startswith("script/") for l in self._tesseract_languages):
+        if any(lang.startswith("script/") for lang in self._tesseract_languages):
            script_prefix = "script/"
        else:
            script_prefix = ""
@ -224,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
                                fname = image_file.name
                                high_res_image.save(image_file)

-                            df = self._run_tesseract(fname)
+                            df_result = self._run_tesseract(fname)
                        finally:
                            if os.path.exists(fname):
                                os.remove(fname)

-                        # _log.info(df)
+                        # _log.info(df_result)

                        # Print relevant columns (bounding box and text)
-                        for ix, row in df.iterrows():
+                        for ix, row in df_result.iterrows():
                            text = row["text"]
                            conf = row["conf"]

-                            l = float(row["left"])
+                            l = float(row["left"])  # noqa: E741
                            b = float(row["top"])
                            w = float(row["width"])
                            h = float(row["height"])
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -38,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
        self.options: TesseractOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-        self.reader = None
-        self.osd_reader = None
-        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

        if self.enabled:
            install_errmsg = (
@ -76,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
            _log.debug("Initializing TesserOCR: %s", tesseract_version)
            lang = "+".join(self.options.lang)

-            if any(l.startswith("script/") for l in self._tesserocr_languages):
+            if any(lang.startswith("script/") for lang in self._tesserocr_languages):
                self.script_prefix = "script/"
            else:
                self.script_prefix = ""
@ -87,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
                "oem": tesserocr.OEM.DEFAULT,
            }

+            self.reader = None
+            self.osd_reader = None
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
+
            if self.options.path is not None:
                tesserocr_kwargs["path"] = self.options.path

--- a/docling/utils/glm_utils.py
+++ b/docling/utils/glm_utils.py
@ -29,7 +29,7 @@ def resolve_item(paths, obj):

    try:
        key = int(paths[0])
-    except:
+    except Exception:
        key = paths[0]

    if len(paths) == 1:
--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@ -80,10 +80,10 @@ def main():
        )

    # Generate one parquet from all documents
-    df = pd.json_normalize(rows)
+    df_result = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
-    df.to_parquet(output_filename)
+    df_result.to_parquet(output_filename)

    end_time = time.time() - start_time

--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@ -2,6 +2,9 @@ import json
 import time
 from pathlib import Path

+from docling_core.types.doc import DocItemLabel, ImageRefMode
+from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
+
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
@ -33,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options

-from docling_core.types.doc import DocItemLabel, ImageRefMode
-from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
-
 ## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
    format_options={
--- a/docs/examples/rag_azuresearch.ipynb
+++ b/docs/examples/rag_azuresearch.ipynb
@ -283,7 +283,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
@ -369,7 +369,7 @@
    "    new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
    "    try:\n",
    "        index_client.delete_index(index_name)\n",
-    "    except:\n",
+    "    except Exception:\n",
    "        pass\n",
    "\n",
    "    index_client.create_or_update_index(new_index)\n",
--- a/docs/examples/rag_weaviate.ipynb
+++ b/docs/examples/rag_weaviate.ipynb
@ -59,7 +59,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "id": "u076oUSF_YUG"
@ -72,12 +72,11 @@
    "%pip install rich\n",
    "%pip install torch\n",
    "\n",
+    "import logging\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
-    "import logging\n",
-    "\n",
    "# Suppress Weaviate client logs\n",
    "logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
   ]
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@ -122,8 +122,8 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
        "document has different count of tables than expected."
    )

-    for l, true_item in enumerate(doc_true.tables):
-        pred_item = doc_pred.tables[l]
+    for ix, true_item in enumerate(doc_true.tables):
+        pred_item = doc_pred.tables[ix]

        assert true_item.num_rows == pred_item.num_rows, (
            "table does not have the same #-rows"