mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
finalize linter fixes
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
ad28271b4a
commit
1b5337abf9
@ -9,48 +9,18 @@ repos:
|
|||||||
args: [--config=pyproject.toml]
|
args: [--config=pyproject.toml]
|
||||||
files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
|
files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
|
||||||
# Run the Ruff linter.
|
# Run the Ruff linter.
|
||||||
# - id: ruff
|
- id: ruff
|
||||||
# name: "Ruff linter"
|
name: "Ruff linter"
|
||||||
# args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
|
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
|
||||||
# files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
|
files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
|
||||||
- repo: local
|
- repo: local
|
||||||
hooks:
|
hooks:
|
||||||
# - id: black
|
|
||||||
# name: Black
|
|
||||||
# entry: poetry run black docling docs/examples tests
|
|
||||||
# pass_filenames: false
|
|
||||||
# language: system
|
|
||||||
# files: '\.py$'
|
|
||||||
# - id: isort
|
|
||||||
# name: isort
|
|
||||||
# entry: poetry run isort docling docs/examples tests
|
|
||||||
# pass_filenames: false
|
|
||||||
# language: system
|
|
||||||
# files: '\.py$'
|
|
||||||
# - id: flake8
|
|
||||||
# name: flake8
|
|
||||||
# entry: poetry run flake8 docling
|
|
||||||
# pass_filenames: false
|
|
||||||
# language: system
|
|
||||||
# files: '\.py$'
|
|
||||||
- id: mypy
|
- id: mypy
|
||||||
name: MyPy
|
name: MyPy
|
||||||
entry: poetry run mypy docling
|
entry: poetry run mypy docling
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
language: system
|
language: system
|
||||||
files: '\.py$'
|
files: '\.py$'
|
||||||
# - id: nbqa_black
|
|
||||||
# name: nbQA Black
|
|
||||||
# entry: poetry run nbqa black docs/examples
|
|
||||||
# pass_filenames: false
|
|
||||||
# language: system
|
|
||||||
# files: '\.ipynb$'
|
|
||||||
# - id: nbqa_isort
|
|
||||||
# name: nbQA isort
|
|
||||||
# entry: poetry run nbqa isort docs/examples
|
|
||||||
# pass_filenames: false
|
|
||||||
# language: system
|
|
||||||
# files: '\.ipynb$'
|
|
||||||
- id: poetry
|
- id: poetry
|
||||||
name: Poetry check
|
name: Poetry check
|
||||||
entry: poetry check --lock
|
entry: poetry check --lock
|
||||||
|
@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
# _log.info(decoded_data)
|
# _log.info(decoded_data)
|
||||||
|
|
||||||
# Read the TSV file generated by Tesseract
|
# Read the TSV file generated by Tesseract
|
||||||
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
df_result = pd.read_csv(
|
||||||
|
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
|
||||||
|
)
|
||||||
|
|
||||||
# Display the dataframe (optional)
|
# Display the dataframe (optional)
|
||||||
# _log.info("df: ", df.head())
|
# _log.info("df: ", df.head())
|
||||||
|
|
||||||
# Filter rows that contain actual text (ignore header or empty rows)
|
# Filter rows that contain actual text (ignore header or empty rows)
|
||||||
df_filtered = df[
|
df_filtered = df_result[
|
||||||
df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
|
df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
|
||||||
]
|
]
|
||||||
|
|
||||||
return df_filtered
|
return df_filtered
|
||||||
@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||||
output, _ = proc.communicate()
|
output, _ = proc.communicate()
|
||||||
decoded_data = output.decode("utf-8")
|
decoded_data = output.decode("utf-8")
|
||||||
df = pd.read_csv(
|
df_detected = pd.read_csv(
|
||||||
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
||||||
)
|
)
|
||||||
scripts = df.loc[df["key"] == "Script"].value.tolist()
|
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
||||||
if len(scripts) == 0:
|
if len(scripts) == 0:
|
||||||
_log.warning("Tesseract cannot detect the script of the page")
|
_log.warning("Tesseract cannot detect the script of the page")
|
||||||
return None
|
return None
|
||||||
@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||||
output, _ = proc.communicate()
|
output, _ = proc.communicate()
|
||||||
decoded_data = output.decode("utf-8")
|
decoded_data = output.decode("utf-8")
|
||||||
df = pd.read_csv(io.StringIO(decoded_data), header=None)
|
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
||||||
self._tesseract_languages = df[0].tolist()[1:]
|
self._tesseract_languages = df_list[0].tolist()[1:]
|
||||||
|
|
||||||
# Decide the script prefix
|
# Decide the script prefix
|
||||||
if any(l.startswith("script/") for l in self._tesseract_languages):
|
if any(lang.startswith("script/") for lang in self._tesseract_languages):
|
||||||
script_prefix = "script/"
|
script_prefix = "script/"
|
||||||
else:
|
else:
|
||||||
script_prefix = ""
|
script_prefix = ""
|
||||||
@ -224,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
fname = image_file.name
|
fname = image_file.name
|
||||||
high_res_image.save(image_file)
|
high_res_image.save(image_file)
|
||||||
|
|
||||||
df = self._run_tesseract(fname)
|
df_result = self._run_tesseract(fname)
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(fname):
|
if os.path.exists(fname):
|
||||||
os.remove(fname)
|
os.remove(fname)
|
||||||
|
|
||||||
# _log.info(df)
|
# _log.info(df_result)
|
||||||
|
|
||||||
# Print relevant columns (bounding box and text)
|
# Print relevant columns (bounding box and text)
|
||||||
for ix, row in df.iterrows():
|
for ix, row in df_result.iterrows():
|
||||||
text = row["text"]
|
text = row["text"]
|
||||||
conf = row["conf"]
|
conf = row["conf"]
|
||||||
|
|
||||||
l = float(row["left"])
|
l = float(row["left"]) # noqa: E741
|
||||||
b = float(row["top"])
|
b = float(row["top"])
|
||||||
w = float(row["width"])
|
w = float(row["width"])
|
||||||
h = float(row["height"])
|
h = float(row["height"])
|
||||||
|
@ -38,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
self.options: TesseractOcrOptions
|
self.options: TesseractOcrOptions
|
||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
self.reader = None
|
|
||||||
self.osd_reader = None
|
|
||||||
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
install_errmsg = (
|
install_errmsg = (
|
||||||
@ -76,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
||||||
lang = "+".join(self.options.lang)
|
lang = "+".join(self.options.lang)
|
||||||
|
|
||||||
if any(l.startswith("script/") for l in self._tesserocr_languages):
|
if any(lang.startswith("script/") for lang in self._tesserocr_languages):
|
||||||
self.script_prefix = "script/"
|
self.script_prefix = "script/"
|
||||||
else:
|
else:
|
||||||
self.script_prefix = ""
|
self.script_prefix = ""
|
||||||
@ -87,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
"oem": tesserocr.OEM.DEFAULT,
|
"oem": tesserocr.OEM.DEFAULT,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.reader = None
|
||||||
|
self.osd_reader = None
|
||||||
|
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
||||||
|
|
||||||
if self.options.path is not None:
|
if self.options.path is not None:
|
||||||
tesserocr_kwargs["path"] = self.options.path
|
tesserocr_kwargs["path"] = self.options.path
|
||||||
|
|
||||||
|
@ -29,7 +29,7 @@ def resolve_item(paths, obj):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
key = int(paths[0])
|
key = int(paths[0])
|
||||||
except:
|
except Exception:
|
||||||
key = paths[0]
|
key = paths[0]
|
||||||
|
|
||||||
if len(paths) == 1:
|
if len(paths) == 1:
|
||||||
|
@ -80,10 +80,10 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Generate one parquet from all documents
|
# Generate one parquet from all documents
|
||||||
df = pd.json_normalize(rows)
|
df_result = pd.json_normalize(rows)
|
||||||
now = datetime.datetime.now()
|
now = datetime.datetime.now()
|
||||||
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
|
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
|
||||||
df.to_parquet(output_filename)
|
df_result.to_parquet(output_filename)
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
|
@ -2,6 +2,9 @@ import json
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||||
|
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
@ -33,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
|
|||||||
## Alternative VLM models:
|
## Alternative VLM models:
|
||||||
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
||||||
|
|
||||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
|
||||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
|
||||||
|
|
||||||
## Set up pipeline for PDF or image inputs
|
## Set up pipeline for PDF or image inputs
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
|
@ -283,7 +283,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -369,7 +369,7 @@
|
|||||||
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
|
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" index_client.delete_index(index_name)\n",
|
" index_client.delete_index(index_name)\n",
|
||||||
" except:\n",
|
" except Exception:\n",
|
||||||
" pass\n",
|
" pass\n",
|
||||||
"\n",
|
"\n",
|
||||||
" index_client.create_or_update_index(new_index)\n",
|
" index_client.create_or_update_index(new_index)\n",
|
||||||
|
@ -59,7 +59,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true,
|
"collapsed": true,
|
||||||
"id": "u076oUSF_YUG"
|
"id": "u076oUSF_YUG"
|
||||||
@ -72,12 +72,11 @@
|
|||||||
"%pip install rich\n",
|
"%pip install rich\n",
|
||||||
"%pip install torch\n",
|
"%pip install torch\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"import logging\n",
|
||||||
"import warnings\n",
|
"import warnings\n",
|
||||||
"\n",
|
"\n",
|
||||||
"warnings.filterwarnings(\"ignore\")\n",
|
"warnings.filterwarnings(\"ignore\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import logging\n",
|
|
||||||
"\n",
|
|
||||||
"# Suppress Weaviate client logs\n",
|
"# Suppress Weaviate client logs\n",
|
||||||
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
|
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
|
||||||
]
|
]
|
||||||
|
@ -122,8 +122,8 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
|
|||||||
"document has different count of tables than expected."
|
"document has different count of tables than expected."
|
||||||
)
|
)
|
||||||
|
|
||||||
for l, true_item in enumerate(doc_true.tables):
|
for ix, true_item in enumerate(doc_true.tables):
|
||||||
pred_item = doc_pred.tables[l]
|
pred_item = doc_pred.tables[ix]
|
||||||
|
|
||||||
assert true_item.num_rows == pred_item.num_rows, (
|
assert true_item.num_rows == pred_item.num_rows, (
|
||||||
"table does not have the same #-rows"
|
"table does not have the same #-rows"
|
||||||
|
Loading…
Reference in New Issue
Block a user