ci: add coverage and ruff (#1383)

* add coverage calculation and push

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use new codecov version and upload token

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* re-add removed imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* first pass on linter issues

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Michele Dolfi committed 2025-04-14 18:01:26 +02:00 (committed by GitHub)
parent 293c28ca7c · commit 5458a88464
104 changed files with 665 additions and 633 deletions

View File

@@ -1,8 +1,8 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Any, Dict, List, Tuple, Union

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

 from docling.datamodel.document import ConversionResult, Page
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
 def generate_multimodal_pages(
     doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
-
     label_to_doclaynet = {
         "title": "title",
         "table-of-contents": "document_index",
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
     if doc.main_text is None:
         return
     for ix, orig_item in enumerate(doc.main_text):
-
         item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
         if item is None or item.prov is None or len(item.prov) == 0:
             _log.debug(f"Skipping item {orig_item}")
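Note: the import change above is ruff's pyupgrade fix (UP035): since Python 3.9 the container ABCs in typing are deprecated aliases, and annotations should import them from collections.abc. A minimal sketch of the modern form (the function is illustrative, not part of this diff):

    from collections.abc import Iterable

    def total_length(items: Iterable[str]) -> int:
        # Accepts any iterable of strings: list, tuple, generator, ...
        return sum(len(s) for s in items)

    print(total_length(["ruff", "lint"]))  # 8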

View File

@@ -29,7 +29,7 @@ def resolve_item(paths, obj):

     try:
         key = int(paths[0])
-    except:
+    except Exception:
         key = paths[0]

     if len(paths) == 1:
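The fix above is ruff's E722: a bare `except:` also catches KeyboardInterrupt and SystemExit, so an explicit exception class is required. A standalone sketch of the same int-or-string key fallback (simplified, not the full resolve_item):

    def coerce_key(raw: str):
        try:
            return int(raw)  # numeric path component becomes an index
        except Exception:  # int() failed; keep the string key
            return raw

    print(coerce_key("3"), coerce_key("name"))  # 3 name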
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
     return unique_objects


-def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
     origin = DocumentOrigin(
         mimetype="application/pdf",
         filename=doc_glm["file-info"]["filename"],
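The `# noqa: C901` trailer silences ruff's mccabe complexity rule for to_docling_document only, instead of refactoring it in a lint-focused commit. The marker is per-line and per-rule; a hypothetical sketch:

    def dispatch(value):  # noqa: C901
        # Only C901 is suppressed on the line above;
        # every other ruff rule still applies to this body.
        return value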

View File

@@ -18,7 +18,7 @@ class UnionFind:

     def __init__(self, elements):
         self.parent = {elem: elem for elem in elements}
-        self.rank = {elem: 0 for elem in elements}
+        self.rank = dict.fromkeys(elements, 0)

     def find(self, x):
         if self.parent[x] != x:
@@ -484,7 +484,9 @@
         spatial_index = (
             self.regular_index
             if cluster_type == "regular"
-            else self.picture_index if cluster_type == "picture" else self.wrapper_index
+            else self.picture_index
+            if cluster_type == "picture"
+            else self.wrapper_index
         )

         # Map of currently valid clusters
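The first hunk is ruff's preferred spelling for a constant-valued dict: `dict.fromkeys(keys, value)` instead of a comprehension (the second hunk just re-wraps a long chained conditional). fromkeys is only safe with immutable values like the 0 here, since all keys share the one value object; a short illustration:

    elements = ["a", "b", "c"]
    rank = dict.fromkeys(elements, 0)  # {'a': 0, 'b': 0, 'c': 0}

    # Pitfall with mutable values: every key shares the same list.
    bad = dict.fromkeys(elements, [])
    bad["a"].append(1)
    print(bad)  # {'a': [1], 'b': [1], 'c': [1]}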

View File

@@ -37,7 +37,7 @@ def download_models(
     output_dir.mkdir(exist_ok=True, parents=True)

     if with_layout:
-        _log.info(f"Downloading layout model...")
+        _log.info("Downloading layout model...")
         LayoutModel.download_models(
             local_dir=output_dir / LayoutModel._model_repo_folder,
             force=force,
@@ -45,7 +45,7 @@
     )

     if with_tableformer:
-        _log.info(f"Downloading tableformer model...")
+        _log.info("Downloading tableformer model...")
         TableStructureModel.download_models(
             local_dir=output_dir / TableStructureModel._model_repo_folder,
             force=force,
@@ -53,7 +53,7 @@
     )

     if with_picture_classifier:
-        _log.info(f"Downloading picture classifier model...")
+        _log.info("Downloading picture classifier model...")
         DocumentPictureClassifier.download_models(
             local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
             force=force,
@@ -61,7 +61,7 @@
     )

     if with_code_formula:
-        _log.info(f"Downloading code formula model...")
+        _log.info("Downloading code formula model...")
         CodeFormulaModel.download_models(
             local_dir=output_dir / CodeFormulaModel._model_repo_folder,
             force=force,
@@ -69,7 +69,7 @@
     )

     if with_smolvlm:
-        _log.info(f"Downloading SmolVlm model...")
+        _log.info("Downloading SmolVlm model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@@ -78,7 +78,7 @@
     )

     if with_granite_vision:
-        _log.info(f"Downloading Granite Vision model...")
+        _log.info("Downloading Granite Vision model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
@@ -87,7 +87,7 @@
     )

     if with_easyocr:
-        _log.info(f"Downloading easyocr models...")
+        _log.info("Downloading easyocr models...")
         EasyOcrModel.download_models(
             local_dir=output_dir / EasyOcrModel._model_repo_folder,
             force=force,
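Every hunk in this file is ruff's F541 fix: an f-string with no placeholders is just a plain string, so the f prefix goes. When a log message does interpolate values, %-style arguments let the logging module defer formatting until the record is actually emitted; a small sketch:

    import logging

    logging.basicConfig(level=logging.INFO)
    _log = logging.getLogger(__name__)

    _log.info("Downloading layout model...")  # constant message, no f needed
    _log.info("Downloaded %d models to %s", 3, "./models")  # lazy formatting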

View File

@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
     if isinstance(iterator, List):
         iterator = iter(iterator)
     for first in iterator:  # Take the first element from the iterator
-        yield [first] + list(islice(iterator, chunk_size - 1))
+        yield [first, *list(islice(iterator, chunk_size - 1))]


 def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
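This last hunk is ruff's RUF005 fix: iterable unpacking `[first, *rest]` replaces list concatenation with `+`. A self-contained sketch of the same chunking pattern, for illustration only:

    from itertools import islice

    def chunkify(iterator, chunk_size):
        # Pull one element, then batch it with up to chunk_size - 1 more;
        # works lazily for generators and never materializes the whole input.
        iterator = iter(iterator)
        for first in iterator:
            yield [first, *list(islice(iterator, chunk_size - 1))]

    print(list(chunkify(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]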