finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-04-14 16:16:02 +02:00
parent ad28271b4a
commit 1b5337abf9
9 changed files with 35 additions and 63 deletions

View File

@ -9,48 +9,18 @@ repos:
args: [--config=pyproject.toml]
files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
# Run the Ruff linter.
# - id: ruff
# name: "Ruff linter"
# args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
# files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
- id: ruff
name: "Ruff linter"
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
- repo: local
hooks:
# - id: black
# name: Black
# entry: poetry run black docling docs/examples tests
# pass_filenames: false
# language: system
# files: '\.py$'
# - id: isort
# name: isort
# entry: poetry run isort docling docs/examples tests
# pass_filenames: false
# language: system
# files: '\.py$'
# - id: flake8
# name: flake8
# entry: poetry run flake8 docling
# pass_filenames: false
# language: system
# files: '\.py$'
- id: mypy
name: MyPy
entry: poetry run mypy docling
pass_filenames: false
language: system
files: '\.py$'
# - id: nbqa_black
# name: nbQA Black
# entry: poetry run nbqa black docs/examples
# pass_filenames: false
# language: system
# files: '\.ipynb$'
# - id: nbqa_isort
# name: nbQA isort
# entry: poetry run nbqa isort docs/examples
# pass_filenames: false
# language: system
# files: '\.ipynb$'
- id: poetry
name: Poetry check
entry: poetry check --lock

View File

@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
# _log.info(decoded_data)
# Read the TSV file generated by Tesseract
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
df_result = pd.read_csv(
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
)
# Display the dataframe (optional)
# _log.info("df_result: %s", df_result.head())
# Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[
df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
df_filtered = df_result[
df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
]
return df_filtered
@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
df = pd.read_csv(
df_detected = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
)
scripts = df.loc[df["key"] == "Script"].value.tolist()
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page")
return None
@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
df = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df[0].tolist()[1:]
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df_list[0].tolist()[1:]
# Decide the script prefix
if any(l.startswith("script/") for l in self._tesseract_languages):
if any(lang.startswith("script/") for lang in self._tesseract_languages):
script_prefix = "script/"
else:
script_prefix = ""
@ -224,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
fname = image_file.name
high_res_image.save(image_file)
df = self._run_tesseract(fname)
df_result = self._run_tesseract(fname)
finally:
if os.path.exists(fname):
os.remove(fname)
# _log.info(df)
# _log.info(df_result)
# Print relevant columns (bounding box and text)
for ix, row in df.iterrows():
for ix, row in df_result.iterrows():
text = row["text"]
conf = row["conf"]
l = float(row["left"])
l = float(row["left"]) # noqa: E741
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])

View File

@ -38,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.enabled:
install_errmsg = (
@ -76,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
_log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang)
if any(l.startswith("script/") for l in self._tesserocr_languages):
if any(lang.startswith("script/") for lang in self._tesserocr_languages):
self.script_prefix = "script/"
else:
self.script_prefix = ""
@ -87,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
"oem": tesserocr.OEM.DEFAULT,
}
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path

View File

@ -29,7 +29,7 @@ def resolve_item(paths, obj):
try:
key = int(paths[0])
except:
except Exception:
key = paths[0]
if len(paths) == 1:

View File

@ -80,10 +80,10 @@ def main():
)
# Generate one parquet from all documents
df = pd.json_normalize(rows)
df_result = pd.json_normalize(rows)
now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename)
df_result.to_parquet(output_filename)
end_time = time.time() - start_time

View File

@ -2,6 +2,9 @@ import json
import time
from pathlib import Path
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
@ -33,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
## Set up pipeline for PDF or image inputs
converter = DocumentConverter(
format_options={

View File

@ -283,7 +283,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -369,7 +369,7 @@
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
" try:\n",
" index_client.delete_index(index_name)\n",
" except:\n",
" except Exception:\n",
" pass\n",
"\n",
" index_client.create_or_update_index(new_index)\n",

View File

@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"collapsed": true,
"id": "u076oUSF_YUG"
@ -72,12 +72,11 @@
"%pip install rich\n",
"%pip install torch\n",
"\n",
"import logging\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"import logging\n",
"\n",
"# Suppress Weaviate client logs\n",
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
]

View File

@ -122,8 +122,8 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
"document has different count of tables than expected."
)
for l, true_item in enumerate(doc_true.tables):
pred_item = doc_pred.tables[l]
for ix, true_item in enumerate(doc_true.tables):
pred_item = doc_pred.tables[ix]
assert true_item.num_rows == pred_item.num_rows, (
"table does not have the same #-rows"