diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b0db1a8d..041a100b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,48 +9,18 @@ repos:
         args: [--config=pyproject.toml]
         files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
       # Run the Ruff linter.
-      # - id: ruff
-      #   name: "Ruff linter"
-      #   args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
-      #   files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      - id: ruff
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
   - repo: local
     hooks:
-      # - id: black
-      #   name: Black
-      #   entry: poetry run black docling docs/examples tests
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
-      # - id: isort
-      #   name: isort
-      #   entry: poetry run isort docling docs/examples tests
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
-#      - id: flake8
-#        name: flake8
-#        entry: poetry run flake8 docling
-#        pass_filenames: false
-#        language: system
-#        files: '\.py$'
       - id: mypy
         name: MyPy
         entry: poetry run mypy docling
         pass_filenames: false
         language: system
         files: '\.py$'
-      # - id: nbqa_black
-      #   name: nbQA Black
-      #   entry: poetry run nbqa black docs/examples
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.ipynb$'
-      # - id: nbqa_isort
-      #   name: nbQA isort
-      #   entry: poetry run nbqa isort docs/examples
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.ipynb$'
       - id: poetry
         name: Poetry check
         entry: poetry check --lock
diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index 0467e70f..91b4555f 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info(decoded_data)
 
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
+        df_result = pd.read_csv(
+            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
+        )
 
         # Display the dataframe (optional)
         # _log.info("df: ", df.head())
 
         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[
-            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        df_filtered = df_result[
+            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
         ]
 
         return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(
+        df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(io.StringIO(decoded_data), header=None)
-        self._tesseract_languages = df[0].tolist()[1:]
+        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df_list[0].tolist()[1:]
 
         # Decide the script prefix
-        if any(l.startswith("script/") for l in self._tesseract_languages):
+        if any(lang.startswith("script/") for lang in self._tesseract_languages):
             script_prefix = "script/"
         else:
             script_prefix = ""
@@ -224,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
                                 fname = image_file.name
                                 high_res_image.save(image_file)
 
-                            df = self._run_tesseract(fname)
+                            df_result = self._run_tesseract(fname)
                         finally:
                             if os.path.exists(fname):
                                 os.remove(fname)
 
-                        # _log.info(df)
+                        # _log.info(df_result)
 
                         # Print relevant columns (bounding box and text)
-                        for ix, row in df.iterrows():
+                        for ix, row in df_result.iterrows():
                             text = row["text"]
                             conf = row["conf"]
 
-                            l = float(row["left"])
+                            l = float(row["left"])  # noqa: E741
                             b = float(row["top"])
                             w = float(row["width"])
                             h = float(row["height"])
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index 39f5b86c..fbe907cc 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -38,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
         self.options: TesseractOcrOptions
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-        self.reader = None
-        self.osd_reader = None
-        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
 
         if self.enabled:
             install_errmsg = (
@@ -76,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
 
-            if any(l.startswith("script/") for l in self._tesserocr_languages):
+            if any(lang.startswith("script/") for lang in self._tesserocr_languages):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""
@@ -87,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
                 "oem": tesserocr.OEM.DEFAULT,
             }
 
+            self.reader = None
+            self.osd_reader = None
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
+
             if self.options.path is not None:
                 tesserocr_kwargs["path"] = self.options.path
 
diff --git a/docling/utils/glm_utils.py b/docling/utils/glm_utils.py
index 46ac0bce..b67281fa 100644
--- a/docling/utils/glm_utils.py
+++ b/docling/utils/glm_utils.py
@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
 
     try:
         key = int(paths[0])
-    except:
+    except Exception:
         key = paths[0]
 
     if len(paths) == 1:
diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py
index a49999a2..bef74bfa 100644
--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@@ -80,10 +80,10 @@ def main():
         )
 
     # Generate one parquet from all documents
-    df = pd.json_normalize(rows)
+    df_result = pd.json_normalize(rows)
     now = datetime.datetime.now()
     output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
-    df.to_parquet(output_filename)
+    df_result.to_parquet(output_filename)
 
     end_time = time.time() - start_time
 
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 5211fa44..fab63425 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -2,6 +2,9 @@ import json
 import time
 from pathlib import Path
 
+from docling_core.types.doc import DocItemLabel, ImageRefMode
+from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
+
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
@@ -33,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
 
-from docling_core.types.doc import DocItemLabel, ImageRefMode
-from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
-
 ## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
     format_options={
diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb
index 6e77352d..b206069d 100644
--- a/docs/examples/rag_azuresearch.ipynb
+++ b/docs/examples/rag_azuresearch.ipynb
@@ -283,7 +283,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -369,7 +369,7 @@
    "    new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
    "    try:\n",
    "        index_client.delete_index(index_name)\n",
-    "    except:\n",
+    "    except Exception:\n",
    "        pass\n",
    "\n",
    "    index_client.create_or_update_index(new_index)\n",
diff --git a/docs/examples/rag_weaviate.ipynb b/docs/examples/rag_weaviate.ipynb
index 7047f0fe..627e8927 100644
--- a/docs/examples/rag_weaviate.ipynb
+++ b/docs/examples/rag_weaviate.ipynb
@@ -59,7 +59,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "collapsed": true,
     "id": "u076oUSF_YUG"
@@ -72,12 +72,11 @@
    "%pip install rich\n",
    "%pip install torch\n",
    "\n",
+    "import logging\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
-    "import logging\n",
-    "\n",
    "# Suppress Weaviate client logs\n",
    "logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
   ]
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index ab3412e3..1a913c2c 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -122,8 +122,8 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
         "document has different count of tables than expected."
     )
 
-    for l, true_item in enumerate(doc_true.tables):
-        pred_item = doc_pred.tables[l]
+    for ix, true_item in enumerate(doc_true.tables):
+        pred_item = doc_pred.tables[ix]
 
         assert true_item.num_rows == pred_item.num_rows, (
             "table does not have the same #-rows"