ci: add coverage and ruff (#1383)

* add coverage calculation and push

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* new codecov version and usage of token

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add removed imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* runs 1 on linter issues

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-12-08 20:58:11 +00:00 · 2025-04-14 18:01:26 +02:00
parent 293c28ca7c
commit 5458a88464
104 changed files with 665 additions and 633 deletions
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@@ -383,7 +383,7 @@
    "\n",
    "print(f\"Downloading {url}...\")\n",
    "buf = BytesIO(requests.get(url).content)\n",
-    "print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
+    "print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
    "with zipfile.ZipFile(buf) as zf:\n",
    "    res = zf.testzip()\n",
    "    if res:\n",
@@ -544,7 +544,7 @@
   "source": [
    "doc = backend.convert()\n",
    "\n",
-    "claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
+    "claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
    "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
   ]
  },
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@@ -1,8 +1,8 @@
 import json
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 import yaml
 from docling_core.types.doc import ImageRefMode
@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

 _log = logging.getLogger(__name__)
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -3,7 +3,6 @@ import logging
 import time
 from pathlib import Path

-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.models.ocr_mac_model import OcrMacOptions
-from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
-from docling.models.tesseract_ocr_model import TesseractOcrOptions

 _log = logging.getLogger(__name__)

--- a/docs/examples/develop_formula_understanding.py
+++ b/docs/examples/develop_formula_understanding.py
@@ -3,8 +3,8 @@
 # It does not run the actual formula understanding model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem

@@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel

 # How the pipeline can be extended.
 class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
-
    def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
        super().__init__(pipeline_options)
        self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions
@@ -85,7 +84,7 @@ def main():
            )
        }
    )
-    result = doc_converter.convert(input_doc_path)
+    doc_converter.convert(input_doc_path)


 if __name__ == "__main__":
--- a/docs/examples/develop_picture_enrichment.py
+++ b/docs/examples/develop_picture_enrichment.py
@@ -3,8 +3,9 @@
 # It does not run the actual picture classifier model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable
+from typing import Any

 from docling_core.types.doc import (
    DoclingDocument,
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@@ -4,7 +4,7 @@ from pathlib import Path

 from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@@ -51,7 +51,6 @@ def main():
        page_segments,
        page,
    ) in generate_multimodal_pages(conv_res):
-
        dpi = page._default_image_scale * 72

        rows.append(
@@ -81,10 +80,10 @@ def main():
        )

    # Generate one parquet from all documents
-    df = pd.json_normalize(rows)
+    df_result = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
-    df.to_parquet(output_filename)
+    df_result.to_parquet(output_filename)

    end_time = time.time() - start_time

--- a/docs/examples/export_tables.py
+++ b/docs/examples/export_tables.py
@@ -32,12 +32,12 @@ def main():
        print(table_df.to_markdown())

        # Save the table as csv
-        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
+        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
        _log.info(f"Saving CSV table to {element_csv_filename}")
        table_df.to_csv(element_csv_filename)

        # Save the table as html
-        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
+        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
        _log.info(f"Saving HTML table to {element_html_filename}")
        with element_html_filename.open("w") as fp:
            fp.write(table.export_to_html(doc=conv_res.document))
--- a/docs/examples/full_page_ocr.py
+++ b/docs/examples/full_page_ocr.py
@@ -1,14 +1,9 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    OcrMacOptions,
    PdfPipelineOptions,
-    RapidOcrOptions,
    TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption

--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -153,10 +153,10 @@
   "source": [
    "for i, chunk in enumerate(chunk_iter):\n",
    "    print(f\"=== {i} ===\")\n",
-    "    print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
+    "    print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
    "\n",
    "    enriched_text = chunker.serialize(chunk=chunk)\n",
-    "    print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
+    "    print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
    "\n",
    "    print()"
   ]
@@ -353,11 +353,11 @@
    "for i, chunk in enumerate(chunks):\n",
    "    print(f\"=== {i} ===\")\n",
    "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
-    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
+    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
    "\n",
    "    ser_txt = chunker.serialize(chunk=chunk)\n",
    "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
-    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
+    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
    "\n",
    "    print()"
   ]
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -2,17 +2,14 @@ import json
 import time
 from pathlib import Path

-import yaml
+from docling_core.types.doc import DocItemLabel, ImageRefMode
+from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS

 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
    VlmPipelineOptions,
-    granite_vision_vlm_conversion_options,
-    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
 )
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options

-from docling_core.types.doc import DocItemLabel, ImageRefMode
-from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
-
 ## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
    format_options={
@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
 for source in sources:
    start_time = time.time()
    print("================================================")
-    print("Processing... {}".format(source))
+    print(f"Processing... {source}")
    print("================================================")
    print("")

@@ -77,7 +71,7 @@ for source in sources:
        print(page.predictions.vlm_response.text)

    res.document.save_as_html(
-        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+        filename=Path(f"{out_path}/{res.input.file.stem}.html"),
        image_mode=ImageRefMode.REFERENCED,
        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
    )
--- a/docs/examples/pictures_description.ipynb
+++ b/docs/examples/pictures_description.ipynb
@@ -144,7 +144,7 @@
    "for pic in doc.pictures[:5]:\n",
    "    html_item = (\n",
    "        f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
-    "        f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+    "        f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
    "        f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
    "    )\n",
    "    for annotation in pic.annotations:\n",
@@ -252,7 +252,7 @@
    "for pic in doc.pictures[:5]:\n",
    "    html_item = (\n",
    "        f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
-    "        f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+    "        f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
    "        f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
    "    )\n",
    "    for annotation in pic.annotations:\n",
--- a/docs/examples/rag_azuresearch.ipynb
+++ b/docs/examples/rag_azuresearch.ipynb
@@ -283,7 +283,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
@@ -369,7 +369,7 @@
    "    new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
    "    try:\n",
    "        index_client.delete_index(index_name)\n",
-    "    except:\n",
+    "    except Exception:\n",
    "        pass\n",
    "\n",
    "    index_client.create_or_update_index(new_index)\n",
@@ -487,7 +487,7 @@
    "\n",
    "    all_succeeded = all(r.succeeded for r in resp)\n",
    "    console.print(\n",
-    "        f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
+    "        f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n",
    "        f\"first_doc_status_code: {resp[0].status_code}\"\n",
    "    )\n",
    "\n",
@@ -807,10 +807,12 @@
    }
   ],
   "source": [
+    "from typing import Optional\n",
+    "\n",
    "from azure.search.documents.models import VectorizableTextQuery\n",
    "\n",
    "\n",
-    "def generate_chat_response(prompt: str, system_message: str = None):\n",
+    "def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
    "    \"\"\"\n",
    "    Generates a single-turn chat response using Azure OpenAI Chat.\n",
    "    If you need multi-turn conversation or follow-up queries, you'll have to\n",
--- a/docs/examples/rag_haystack.ipynb
+++ b/docs/examples/rag_haystack.ipynb
@@ -351,7 +351,7 @@
    "for source in sources:\n",
    "    if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
    "        doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
-    "        print(f\"- text: {repr(doc_chunk.text)}\")\n",
+    "        print(f\"- text: {doc_chunk.text!r}\")\n",
    "        if doc_chunk.meta.origin:\n",
    "            print(f\"  file: {doc_chunk.meta.origin.filename}\")\n",
    "        if doc_chunk.meta.headings:\n",
--- a/docs/examples/rag_langchain.ipynb
+++ b/docs/examples/rag_langchain.ipynb
@@ -341,7 +341,7 @@
    "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
    "for i, doc in enumerate(resp_dict[\"context\"]):\n",
    "    print()\n",
-    "    print(f\"Source {i+1}:\")\n",
+    "    print(f\"Source {i + 1}:\")\n",
    "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
    "    for key in doc.metadata:\n",
    "        if key != \"pk\":\n",
--- a/docs/examples/rag_weaviate.ipynb
+++ b/docs/examples/rag_weaviate.ipynb
@@ -59,7 +59,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "id": "u076oUSF_YUG"
@@ -72,12 +72,11 @@
    "%pip install rich\n",
    "%pip install torch\n",
    "\n",
+    "import logging\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
-    "import logging\n",
-    "\n",
    "# Suppress Weaviate client logs\n",
    "logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
   ]
@@ -119,7 +118,7 @@
    "    device = torch.device(\"mps\")\n",
    "    print(\"MPS GPU is enabled.\")\n",
    "else:\n",
-    "    raise EnvironmentError(\n",
+    "    raise OSError(\n",
    "        \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
    "    )"
   ]
@@ -226,7 +225,6 @@
    }
   ],
   "source": [
-    "from docling.datamodel.document import ConversionResult\n",
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "# Instantiate the doc converter\n",
@@ -345,7 +343,7 @@
    "\n",
    "    openai_api_key = os.getenv(openai_api_key_var)\n",
    "    if not openai_api_key:\n",
-    "        raise EnvironmentError(\n",
+    "        raise OSError(\n",
    "            f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
    "            \"Please define it before running this script.\"\n",
    "        )"
@@ -387,7 +385,6 @@
   "outputs": [],
   "source": [
    "import weaviate.classes.config as wc\n",
-    "from weaviate.classes.config import DataType, Property\n",
    "\n",
    "# Define the collection name\n",
    "collection_name = \"docling\"\n",
--- a/docs/examples/run_md.py
+++ b/docs/examples/run_md.py
@@ -25,9 +25,7 @@ def main():
        document = mdb.convert()

        out_path = Path("scratch")
-        print(
-            f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
-        )
+        print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")

        # Export Docling document format to markdowndoc:
        fn = os.path.basename(path)
--- a/docs/examples/run_with_accelerator.py
+++ b/docs/examples/run_with_accelerator.py
@@ -1,13 +1,10 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@@ -63,7 +63,7 @@ def main():
        out_path = Path("scratch")
        print(
            f"Document {res.input.file.name} converted."
-            f"\nSaved markdown output to: {str(out_path)}"
+            f"\nSaved markdown output to: {out_path!s}"
        )
        _log.debug(res.document._export_to_indented_text(max_text_len=16))
        # Export Docling document format to markdowndoc:
--- a/docs/examples/tesseract_lang_detection.py
+++ b/docs/examples/tesseract_lang_detection.py
@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption

--- a/docs/examples/translate.py
+++ b/docs/examples/translate.py
@@ -2,9 +2,9 @@ import logging
 import time
 from pathlib import Path

-from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
+from docling_core.types.doc import ImageRefMode, TableItem, TextItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0

 # FIXME: put in your favorite translation code ....
 def translate(text: str, src: str = "en", dest: str = "de"):
-
    _log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
    # from googletrans import Translator

@@ -52,10 +51,9 @@ def main():
        }
    )

-    start_time = time.time()
-
    conv_res = doc_converter.convert(input_doc_path)
    conv_doc = conv_res.document
+    doc_filename = conv_res.input.file

    # Save markdown with embedded pictures in original text
    md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
--- a/docs/examples/visual_grounding.ipynb
+++ b/docs/examples/visual_grounding.ipynb
@@ -432,7 +432,7 @@
    "\n",
    "for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
    "    image_by_page = {}\n",
-    "    print(f\"Source {i+1}:\")\n",
+    "    print(f\"Source {i + 1}:\")\n",
    "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
    "    meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
    "\n",
--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    ResponseFormat,
    VlmPipelineOptions,
-    granite_vision_vlm_ollama_conversion_options,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline