ci: add coverage and ruff (#1383)

* add coverage calculation and push

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use new codecov version and upload token

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* re-add removed imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* first pass on linter issues

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Michele Dolfi committed 2025-04-14 18:01:26 +02:00 (committed by GitHub)
parent 293c28ca7c · commit 5458a88464
104 changed files with 665 additions and 633 deletions

View File

@@ -1,8 +1,8 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Any, Dict, List, Tuple, Union

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

 from docling.datamodel.document import ConversionResult, Page
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
 def generate_multimodal_pages(
     doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
-
     label_to_doclaynet = {
         "title": "title",
         "table-of-contents": "document_index",
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
     if doc.main_text is None:
         return
     for ix, orig_item in enumerate(doc.main_text):
-
         item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
         if item is None or item.prov is None or len(item.prov) == 0:
             _log.debug(f"Skipping item {orig_item}")
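Note: the import change above is ruff's pyupgrade fix (UP035): since Python 3.9 the container ABCs in typing are deprecated aliases, and annotations should import them from collections.abc. A minimal sketch of the modern form (the function is illustrative, not part of this diff):

    from collections.abc import Iterable

    def total_length(items: Iterable[str]) -> int:
        # Accepts any iterable of strings: list, tuple, generator, ...
        return sum(len(s) for s in items)

    print(total_length(["ruff", "lint"]))  # 8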

View File

@@ -29,7 +29,7 @@ def resolve_item(paths, obj):

     try:
         key = int(paths[0])
-    except:
+    except Exception:
         key = paths[0]

     if len(paths) == 1:
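The fix above is ruff's E722: a bare `except:` also catches KeyboardInterrupt and SystemExit, so an explicit exception class is required. A standalone sketch of the same int-or-string key fallback (simplified, not the full resolve_item):

    def coerce_key(raw: str):
        try:
            return int(raw)  # numeric path component becomes an index
        except Exception:  # int() failed; keep the string key
            return raw

    print(coerce_key("3"), coerce_key("name"))  # 3 name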
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
     return unique_objects


-def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
     origin = DocumentOrigin(
         mimetype="application/pdf",
         filename=doc_glm["file-info"]["filename"],
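The `# noqa: C901` trailer silences ruff's mccabe complexity rule for to_docling_document only, instead of refactoring it in a lint-focused commit. The marker is per-line and per-rule; a hypothetical sketch:

    def dispatch(value):  # noqa: C901
        # Only C901 is suppressed on the line above;
        # every other ruff rule still applies to this body.
        return value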

View File

@@ -18,7 +18,7 @@ class UnionFind:

     def __init__(self, elements):
         self.parent = {elem: elem for elem in elements}
-        self.rank = {elem: 0 for elem in elements}
+        self.rank = dict.fromkeys(elements, 0)

     def find(self, x):
         if self.parent[x] != x:
@@ -484,7 +484,9 @@
         spatial_index = (
             self.regular_index
             if cluster_type == "regular"
-            else self.picture_index if cluster_type == "picture" else self.wrapper_index
+            else self.picture_index
+            if cluster_type == "picture"
+            else self.wrapper_index
         )

         # Map of currently valid clusters
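The first hunk is ruff's preferred spelling for a constant-valued dict: `dict.fromkeys(keys, value)` instead of a comprehension (the second hunk just re-wraps a long chained conditional). fromkeys is only safe with immutable values like the 0 here, since all keys share the one value object; a short illustration:

    elements = ["a", "b", "c"]
    rank = dict.fromkeys(elements, 0)  # {'a': 0, 'b': 0, 'c': 0}

    # Pitfall with mutable values: every key shares the same list.
    bad = dict.fromkeys(elements, [])
    bad["a"].append(1)
    print(bad)  # {'a': [1], 'b': [1], 'c': [1]}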

View File

@@ -37,7 +37,7 @@ def download_models(
     output_dir.mkdir(exist_ok=True, parents=True)

     if with_layout:
-        _log.info(f"Downloading layout model...")
+        _log.info("Downloading layout model...")
         LayoutModel.download_models(
             local_dir=output_dir / LayoutModel._model_repo_folder,
             force=force,
@@ -45,7 +45,7 @@
     )

     if with_tableformer:
-        _log.info(f"Downloading tableformer model...")
+        _log.info("Downloading tableformer model...")
         TableStructureModel.download_models(
             local_dir=output_dir / TableStructureModel._model_repo_folder,
             force=force,
@@ -53,7 +53,7 @@
     )

     if with_picture_classifier:
-        _log.info(f"Downloading picture classifier model...")
+        _log.info("Downloading picture classifier model...")
         DocumentPictureClassifier.download_models(
             local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
             force=force,
@@ -61,7 +61,7 @@
     )

     if with_code_formula:
-        _log.info(f"Downloading code formula model...")
+        _log.info("Downloading code formula model...")
         CodeFormulaModel.download_models(
             local_dir=output_dir / CodeFormulaModel._model_repo_folder,
             force=force,
@@ -69,7 +69,7 @@
     )

     if with_smolvlm:
-        _log.info(f"Downloading SmolVlm model...")
+        _log.info("Downloading SmolVlm model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@@ -78,7 +78,7 @@
     )

     if with_granite_vision:
-        _log.info(f"Downloading Granite Vision model...")
+        _log.info("Downloading Granite Vision model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
@@ -87,7 +87,7 @@
     )

     if with_easyocr:
-        _log.info(f"Downloading easyocr models...")
+        _log.info("Downloading easyocr models...")
         EasyOcrModel.download_models(
             local_dir=output_dir / EasyOcrModel._model_repo_folder,
             force=force,
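Every hunk in this file is ruff's F541 fix: an f-string with no placeholders is just a plain string, so the f prefix goes. When a log message does interpolate values, %-style arguments let the logging module defer formatting until the record is actually emitted; a small sketch:

    import logging

    logging.basicConfig(level=logging.INFO)
    _log = logging.getLogger(__name__)

    _log.info("Downloading layout model...")  # constant message, no f needed
    _log.info("Downloaded %d models to %s", 3, "./models")  # lazy formatting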

View File

@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
     if isinstance(iterator, List):
         iterator = iter(iterator)
     for first in iterator:  # Take the first element from the iterator
-        yield [first] + list(islice(iterator, chunk_size - 1))
+        yield [first, *list(islice(iterator, chunk_size - 1))]


 def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
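This last hunk is ruff's RUF005 fix: iterable unpacking `[first, *rest]` replaces list concatenation with `+`. A self-contained sketch of the same chunking pattern, for illustration only:

    from itertools import islice

    def chunkify(iterator, chunk_size):
        # Pull one element, then batch it with up to chunk_size - 1 more;
        # works lazily for generators and never materializes the whole input.
        iterator = iter(iterator)
        for first in iterator:
            yield [first, *list(islice(iterator, chunk_size - 1))]

    print(list(chunkify(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]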