Mirror of https://github.com/DS4SD/docling.git

commit 1b5337abf9 (parent ad28271b4a)

    finalize linter fixes

    Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
@@ -9,48 +9,18 @@ repos:
         args: [--config=pyproject.toml]
         files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
       # Run the Ruff linter.
-      # - id: ruff
-      #   name: "Ruff linter"
-      #   args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
-      #   files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      - id: ruff
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
   - repo: local
     hooks:
-      # - id: black
-      #   name: Black
-      #   entry: poetry run black docling docs/examples tests
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
-      # - id: isort
-      #   name: isort
-      #   entry: poetry run isort docling docs/examples tests
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
-      # - id: flake8
-      #   name: flake8
-      #   entry: poetry run flake8 docling
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
       - id: mypy
         name: MyPy
         entry: poetry run mypy docling
         pass_filenames: false
         language: system
         files: '\.py$'
-      # - id: nbqa_black
-      #   name: nbQA Black
-      #   entry: poetry run nbqa black docs/examples
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.ipynb$'
-      # - id: nbqa_isort
-      #   name: nbQA isort
-      #   entry: poetry run nbqa isort docs/examples
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.ipynb$'
       - id: poetry
         name: Poetry check
         entry: poetry check --lock
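The interesting flag in the newly enabled hook is --exit-non-zero-on-fix. A rough sketch of what the hook amounts to when pre-commit runs it (assuming ruff is on PATH; pre-commit manages the real invocation):

# Hedged sketch, not the hook itself. --fix rewrites files in place, and
# --exit-non-zero-on-fix makes the process fail even when every violation was
# auto-fixed, so the commit aborts and the corrected files can be re-staged.
import subprocess

result = subprocess.run(
    [
        "ruff",
        "check",
        "--exit-non-zero-on-fix",
        "--fix",
        "--config=pyproject.toml",
        "docling",
        "tests",
    ],
    capture_output=True,
    text=True,
)
print(result.returncode, result.stdout)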
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info(decoded_data)

         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
+        df_result = pd.read_csv(
+            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
+        )

         # Display the dataframe (optional)
         # _log.info("df: ", df.head())

         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[
-            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        df_filtered = df_result[
+            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
         ]

         return df_filtered
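For context, a self-contained sketch of the parse-and-filter pattern this hunk renames, with a hard-coded sample standing in for real tesseract TSV output:

# csv.QUOTE_NONE matters because OCR'd text can contain stray quote characters
# that would otherwise unbalance the parser. The filter keeps only rows whose
# "text" field is present and non-blank (structural rows have empty text).
import csv
import io

import pandas as pd

sample = (
    "level\tleft\ttop\twidth\theight\tconf\ttext\n"
    "5\t10\t12\t40\t14\t96\tHello\n"
    "4\t10\t12\t90\t14\t-1\t\n"
)
df_result = pd.read_csv(io.StringIO(sample), quoting=csv.QUOTE_NONE, sep="\t")
df_filtered = df_result[
    df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
]
print(df_filtered)  # only the "Hello" row survives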
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(
+        df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
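The renamed df_detected frame parses tesseract's OSD (orientation and script detection) report. A sketch with canned output in place of the subprocess call; the sample text is made up but follows the "Key: value" shape tesseract prints:

# Splitting on ":" yields a two-column frame. Values keep the leading space
# from the split, so callers typically strip() them downstream.
import io

import pandas as pd

decoded_data = "Orientation in degrees: 0\nScript: Latin\nScript confidence: 2.44\n"
df_detected = pd.read_csv(
    io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
)
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
print(scripts)  # [' Latin']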
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(io.StringIO(decoded_data), header=None)
-        self._tesseract_languages = df[0].tolist()[1:]
+        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df_list[0].tolist()[1:]

         # Decide the script prefix
-        if any(l.startswith("script/") for l in self._tesseract_languages):
+        if any(lang.startswith("script/") for lang in self._tesseract_languages):
             script_prefix = "script/"
         else:
             script_prefix = ""
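And the companion pattern for `tesseract --list-langs`, again sketched with a canned response; the first output line is a banner, hence the [1:] slice:

import io

import pandas as pd

decoded_data = "List of available languages (3):\neng\nfra\nscript/Latin\n"
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
tesseract_languages = df_list[0].tolist()[1:]

# Some installs expose script models as "script/Latin", others as bare names;
# the prefix is detected once and reused when building script model names.
if any(lang.startswith("script/") for lang in tesseract_languages):
    script_prefix = "script/"
else:
    script_prefix = ""
print(tesseract_languages, repr(script_prefix))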
@@ -224,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
                 fname = image_file.name
                 high_res_image.save(image_file)

-            df = self._run_tesseract(fname)
+            df_result = self._run_tesseract(fname)
         finally:
             if os.path.exists(fname):
                 os.remove(fname)

-        # _log.info(df)
+        # _log.info(df_result)

         # Print relevant columns (bounding box and text)
-        for ix, row in df.iterrows():
+        for ix, row in df_result.iterrows():
             text = row["text"]
             conf = row["conf"]

-            l = float(row["left"])
+            l = float(row["left"])  # noqa: E741
             b = float(row["top"])
             w = float(row["width"])
             h = float(row["height"])
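The `# noqa: E741` deserves a note: ruff's E741 rule flags `l`, `O`, and `I` as ambiguous single-letter names. Since `l`/`b`/`w`/`h` mirror the left/top/width/height columns, the commit silences the rule on that one line rather than renaming. A toy illustration, with a hypothetical dict standing in for a row from df_result.iterrows():

row = {"left": "10", "top": "12", "width": "40", "height": "14"}

l = float(row["left"])  # noqa: E741 -- "l" reads as "left" alongside b, w, h
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
print(l, b, w, h)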
@@ -38,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
         self.options: TesseractOcrOptions

         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-        self.reader = None
-        self.osd_reader = None
-        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

         if self.enabled:
             install_errmsg = (
@@ -76,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)

-            if any(l.startswith("script/") for l in self._tesserocr_languages):
+            if any(lang.startswith("script/") for lang in self._tesserocr_languages):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""
@@ -87,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
                 "oem": tesserocr.OEM.DEFAULT,
             }

+            self.reader = None
+            self.osd_reader = None
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
+
             if self.options.path is not None:
                 tesserocr_kwargs["path"] = self.options.path
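Two notes on the tesseract_ocr_model hunks. First, renaming the generator variable `l` to `lang` is safe even though an outer `lang` is bound two lines earlier: a generator expression's loop variable is scoped to the expression itself, so the outer binding is untouched. Second, the reader attributes move from unconditional __init__ code into the enabled branch. A minimal sketch of the resulting shape (hypothetical class name, tesserocr elided):

class OcrModelSketch:
    def __init__(self, enabled: bool):
        self.enabled = enabled
        self.scale = 3  # multiplier for 72 dpi == 216 dpi

        if self.enabled:
            # Readers now live where they are configured and used; when the
            # engine is disabled they are never created at all.
            self.reader = None
            self.osd_reader = None
            self.script_readers: dict[str, object] = {}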
@@ -29,7 +29,7 @@ def resolve_item(paths, obj):

     try:
         key = int(paths[0])
-    except:
+    except Exception:
         key = paths[0]

     if len(paths) == 1:
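This is a ruff E722 cleanup: a bare `except:` also catches KeyboardInterrupt and SystemExit, which should normally propagate. The int-or-string key pattern in isolation:

def coerce_key(raw: str):
    # "3" becomes the list index 3; anything non-numeric stays a dict key.
    try:
        return int(raw)
    except Exception:  # narrower than bare except: Ctrl-C still propagates
        return raw

print(coerce_key("3"), coerce_key("title"))  # 3 title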
@@ -80,10 +80,10 @@ def main():
     )

     # Generate one parquet from all documents
-    df = pd.json_normalize(rows)
+    df_result = pd.json_normalize(rows)
     now = datetime.datetime.now()
     output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
-    df.to_parquet(output_filename)
+    df_result.to_parquet(output_filename)

     end_time = time.time() - start_time
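The renamed frame in main() flattens the collected row dicts and writes a single parquet file. A self-contained sketch with toy rows in place of the real multimodal records (to_parquet needs pyarrow or fastparquet installed):

import datetime

import pandas as pd

rows = [
    {"doc": "a.pdf", "page": 1, "cell": {"text": "Hello", "conf": 0.98}},
    {"doc": "a.pdf", "page": 2, "cell": {"text": "World", "conf": 0.91}},
]
df_result = pd.json_normalize(rows)  # nested dicts become cell.text, cell.conf
now = datetime.datetime.now()
df_result.to_parquet(f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet")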
@@ -2,6 +2,9 @@ import json
 import time
 from pathlib import Path

+from docling_core.types.doc import DocItemLabel, ImageRefMode
+from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
+
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
@@ -33,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options

-from docling_core.types.doc import DocItemLabel, ImageRefMode
-from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
-
 ## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
     format_options={
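Both example-script hunks serve the same rule: ruff's E402 ("module level import not at top of file") fires on any import that follows an executable statement, so the docling_core imports move from mid-file to the header block. Schematically:

from pathlib import Path  # all module-level imports first

output_dir = Path("scratch")  # first executable statement

# from docling_core.types.doc import DocItemLabel  <- down here it would be E402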
@@ -283,7 +283,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 23,
+    "execution_count": null,
     "metadata": {},
     "outputs": [
      {
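Nulling execution_count keeps notebook diffs stable; re-running a notebook otherwise changes every count. One hedged way to scrub it in bulk, assuming nbformat is available (the file name is hypothetical):

import nbformat

nb = nbformat.read("example.ipynb", as_version=4)
for cell in nb.cells:
    if cell.cell_type == "code":
        cell.execution_count = None
nbformat.write(nb, "example.ipynb")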
@@ -369,7 +369,7 @@
     "    new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
     "    try:\n",
     "        index_client.delete_index(index_name)\n",
-    "    except:\n",
+    "    except Exception:\n",
     "        pass\n",
     "\n",
     "    index_client.create_or_update_index(new_index)\n",
@@ -59,7 +59,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "collapsed": true,
     "id": "u076oUSF_YUG"
@@ -72,12 +72,11 @@
    "%pip install rich\n",
    "%pip install torch\n",
    "\n",
+   "import logging\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
-   "import logging\n",
-   "\n",
    "# Suppress Weaviate client logs\n",
    "logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
   ]
@@ -122,8 +122,8 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
             "document has different count of tables than expected."
         )

-    for l, true_item in enumerate(doc_true.tables):
-        pred_item = doc_pred.tables[l]
+    for ix, true_item in enumerate(doc_true.tables):
+        pred_item = doc_pred.tables[ix]

         assert true_item.num_rows == pred_item.num_rows, (
             "table does not have the same #-rows"
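The test loop gets the opposite E741 treatment: here `l` is simply renamed to `ix` instead of being suppressed. zip() would avoid the index entirely, but enumerate keeps it available for assertion messages. The pattern in isolation, with stand-in tuples for table objects:

doc_true_tables = [("table-1", 3), ("table-2", 5)]
doc_pred_tables = [("table-1", 3), ("table-2", 5)]

for ix, true_item in enumerate(doc_true_tables):
    pred_item = doc_pred_tables[ix]
    assert true_item[1] == pred_item[1], f"table {ix} does not have the same #-rows"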