finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-04-14 16:16:02 +02:00
parent ad28271b4a
commit 1b5337abf9
9 changed files with 35 additions and 63 deletions

View File

@@ -9,48 +9,18 @@ repos:
args: [--config=pyproject.toml] args: [--config=pyproject.toml]
files: '^(docling|tests|docs/examples).*\.(py|ipynb)$' files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
# Run the Ruff linter. # Run the Ruff linter.
# - id: ruff - id: ruff
# name: "Ruff linter" name: "Ruff linter"
# args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml] args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
# files: '^(docling|tests|docs/examples).*\.(py|ipynb)$' files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
- repo: local - repo: local
hooks: hooks:
# - id: black
# name: Black
# entry: poetry run black docling docs/examples tests
# pass_filenames: false
# language: system
# files: '\.py$'
# - id: isort
# name: isort
# entry: poetry run isort docling docs/examples tests
# pass_filenames: false
# language: system
# files: '\.py$'
# - id: flake8
# name: flake8
# entry: poetry run flake8 docling
# pass_filenames: false
# language: system
# files: '\.py$'
- id: mypy - id: mypy
name: MyPy name: MyPy
entry: poetry run mypy docling entry: poetry run mypy docling
pass_filenames: false pass_filenames: false
language: system language: system
files: '\.py$' files: '\.py$'
# - id: nbqa_black
# name: nbQA Black
# entry: poetry run nbqa black docs/examples
# pass_filenames: false
# language: system
# files: '\.ipynb$'
# - id: nbqa_isort
# name: nbQA isort
# entry: poetry run nbqa isort docs/examples
# pass_filenames: false
# language: system
# files: '\.ipynb$'
- id: poetry - id: poetry
name: Poetry check name: Poetry check
entry: poetry check --lock entry: poetry check --lock

View File

@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
# _log.info(decoded_data) # _log.info(decoded_data)
# Read the TSV file generated by Tesseract # Read the TSV file generated by Tesseract
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t") df_result = pd.read_csv(
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
)
# Display the dataframe (optional) # Display the dataframe (optional)
# _log.info("df: ", df.head()) # _log.info("df: ", df.head())
# Filter rows that contain actual text (ignore header or empty rows) # Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[ df_filtered = df_result[
df["text"].notnull() & (df["text"].apply(str).str.strip() != "") df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
] ]
return df_filtered return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate() output, _ = proc.communicate()
decoded_data = output.decode("utf-8") decoded_data = output.decode("utf-8")
df = pd.read_csv( df_detected = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"] io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
) )
scripts = df.loc[df["key"] == "Script"].value.tolist() scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
if len(scripts) == 0: if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page") _log.warning("Tesseract cannot detect the script of the page")
return None return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate() output, _ = proc.communicate()
decoded_data = output.decode("utf-8") decoded_data = output.decode("utf-8")
df = pd.read_csv(io.StringIO(decoded_data), header=None) df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df[0].tolist()[1:] self._tesseract_languages = df_list[0].tolist()[1:]
# Decide the script prefix # Decide the script prefix
if any(l.startswith("script/") for l in self._tesseract_languages): if any(lang.startswith("script/") for lang in self._tesseract_languages):
script_prefix = "script/" script_prefix = "script/"
else: else:
script_prefix = "" script_prefix = ""
@@ -224,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
fname = image_file.name fname = image_file.name
high_res_image.save(image_file) high_res_image.save(image_file)
df = self._run_tesseract(fname) df_result = self._run_tesseract(fname)
finally: finally:
if os.path.exists(fname): if os.path.exists(fname):
os.remove(fname) os.remove(fname)
# _log.info(df) # _log.info(df_result)
# Print relevant columns (bounding box and text) # Print relevant columns (bounding box and text)
for ix, row in df.iterrows(): for ix, row in df_result.iterrows():
text = row["text"] text = row["text"]
conf = row["conf"] conf = row["conf"]
l = float(row["left"]) l = float(row["left"]) # noqa: E741
b = float(row["top"]) b = float(row["top"])
w = float(row["width"]) w = float(row["width"])
h = float(row["height"]) h = float(row["height"])

View File

@@ -38,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
self.options: TesseractOcrOptions self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.enabled: if self.enabled:
install_errmsg = ( install_errmsg = (
@@ -76,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
_log.debug("Initializing TesserOCR: %s", tesseract_version) _log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang) lang = "+".join(self.options.lang)
if any(l.startswith("script/") for l in self._tesserocr_languages): if any(lang.startswith("script/") for lang in self._tesserocr_languages):
self.script_prefix = "script/" self.script_prefix = "script/"
else: else:
self.script_prefix = "" self.script_prefix = ""
@@ -87,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
"oem": tesserocr.OEM.DEFAULT, "oem": tesserocr.OEM.DEFAULT,
} }
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.options.path is not None: if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path tesserocr_kwargs["path"] = self.options.path

View File

@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
try: try:
key = int(paths[0]) key = int(paths[0])
except: except Exception:
key = paths[0] key = paths[0]
if len(paths) == 1: if len(paths) == 1:

View File

@@ -80,10 +80,10 @@ def main():
) )
# Generate one parquet from all documents # Generate one parquet from all documents
df = pd.json_normalize(rows) df_result = pd.json_normalize(rows)
now = datetime.datetime.now() now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet" output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename) df_result.to_parquet(output_filename)
end_time = time.time() - start_time end_time = time.time() - start_time

View File

@@ -2,6 +2,9 @@ import json
import time import time
from pathlib import Path from pathlib import Path
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
VlmPipelineOptions, VlmPipelineOptions,
@@ -33,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
## Alternative VLM models: ## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
## Set up pipeline for PDF or image inputs ## Set up pipeline for PDF or image inputs
converter = DocumentConverter( converter = DocumentConverter(
format_options={ format_options={

View File

@@ -283,7 +283,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -369,7 +369,7 @@
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n", " new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
" try:\n", " try:\n",
" index_client.delete_index(index_name)\n", " index_client.delete_index(index_name)\n",
" except:\n", " except Exception:\n",
" pass\n", " pass\n",
"\n", "\n",
" index_client.create_or_update_index(new_index)\n", " index_client.create_or_update_index(new_index)\n",

View File

@@ -59,7 +59,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true, "collapsed": true,
"id": "u076oUSF_YUG" "id": "u076oUSF_YUG"
@@ -72,12 +72,11 @@
"%pip install rich\n", "%pip install rich\n",
"%pip install torch\n", "%pip install torch\n",
"\n", "\n",
"import logging\n",
"import warnings\n", "import warnings\n",
"\n", "\n",
"warnings.filterwarnings(\"ignore\")\n", "warnings.filterwarnings(\"ignore\")\n",
"\n", "\n",
"import logging\n",
"\n",
"# Suppress Weaviate client logs\n", "# Suppress Weaviate client logs\n",
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)" "logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
] ]

View File

@@ -122,8 +122,8 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
"document has different count of tables than expected." "document has different count of tables than expected."
) )
for l, true_item in enumerate(doc_true.tables): for ix, true_item in enumerate(doc_true.tables):
pred_item = doc_pred.tables[l] pred_item = doc_pred.tables[ix]
assert true_item.num_rows == pred_item.num_rows, ( assert true_item.num_rows == pred_item.num_rows, (
"table does not have the same #-rows" "table does not have the same #-rows"