ci: add coverage and ruff (#1383)

* add coverage calculation and report upload

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* move to the new codecov version and use the upload token

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
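The hunks below show the kinds of rewrites these fixes produce: Iterable imported from collections.abc instead of typing, repr() calls inside f-strings replaced with the !r conversion, str.format() calls turned into f-strings, and bare except clauses narrowed to except Exception. A minimal illustrative sketch (not taken from any single file in this commit, names made up):

# Illustrative sketch only; the patterns mirror the hunks below.
from collections.abc import Iterable  # was: from typing import Iterable


def show(items: Iterable[str]) -> None:
    for i, item in enumerate(items):
        print(f"item {i + 1}: {item!r}")  # was: {repr(item)} inside the f-string
        print(f"Processing... {item}")  # was: "Processing... {}".format(item)


try:
    show(["CLAIMS"])
except Exception:  # was: a bare `except:`
    pass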

* apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
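Ruff labels a fix unsafe when it cannot guarantee the rewrite preserves behaviour; in this diff that shows up, for example, as assignments to never-read variables being dropped (result = doc_converter.convert(...) becoming a bare doc_converter.convert(...) call). A hedged sketch with a made-up function and path:

# Hypothetical sketch of an unsafe fix: the unused assignment is removed and
# only the call's side effects are kept.
def convert(path: str) -> str:
    print(f"converting {path}")
    return f"converted {path}"


# was: result = convert("example.pdf")  (result never read afterwards)
convert("example.pdf")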

* re-add imports that the automated fixes removed

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* first pass on linter issues

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Authored by Michele Dolfi on 2025-04-14 18:01:26 +02:00, committed by GitHub
commit 5458a88464 (parent 293c28ca7c)
104 changed files with 665 additions and 633 deletions


@@ -383,7 +383,7 @@
"\n",
"print(f\"Downloading {url}...\")\n",
"buf = BytesIO(requests.get(url).content)\n",
"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"with zipfile.ZipFile(buf) as zf:\n",
" res = zf.testzip()\n",
" if res:\n",
@@ -544,7 +544,7 @@
"source": [
"doc = backend.convert()\n",
"\n",
"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
"claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
"print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
]
},


@@ -1,8 +1,8 @@
import json
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable
import yaml
from docling_core.types.doc import ImageRefMode
@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)


@@ -3,7 +3,6 @@ import logging
import time
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__)


@@ -3,8 +3,8 @@
# It does not run the actual formula understanding model.
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
@@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel
# How the pipeline can be extended.
class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions
@@ -85,7 +84,7 @@ def main():
)
}
)
result = doc_converter.convert(input_doc_path)
doc_converter.convert(input_doc_path)
if __name__ == "__main__":


@@ -3,8 +3,9 @@
# It does not run the actual picture classifier model.
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Iterable
from typing import Any
from docling_core.types.doc import (
DoclingDocument,


@@ -4,7 +4,7 @@ from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -51,7 +51,6 @@ def main():
page_segments,
page,
) in generate_multimodal_pages(conv_res):
dpi = page._default_image_scale * 72
rows.append(
@@ -81,10 +80,10 @@ def main():
)
# Generate one parquet from all documents
df = pd.json_normalize(rows)
df_result = pd.json_normalize(rows)
now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename)
df_result.to_parquet(output_filename)
end_time = time.time() - start_time


@@ -32,12 +32,12 @@ def main():
print(table_df.to_markdown())
# Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)
# Save the table as html
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html(doc=conv_res.document))


@@ -1,14 +1,9 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -153,10 +153,10 @@
"source": [
"for i, chunk in enumerate(chunk_iter):\n",
" print(f\"=== {i} ===\")\n",
" print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
" print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
"\n",
" enriched_text = chunker.serialize(chunk=chunk)\n",
" print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
" print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
"\n",
" print()"
]
@@ -353,11 +353,11 @@
"for i, chunk in enumerate(chunks):\n",
" print(f\"=== {i} ===\")\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
" print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
"\n",
" ser_txt = chunker.serialize(chunk=chunk)\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
"\n",
" print()"
]


@@ -2,17 +2,14 @@ import json
import time
from pathlib import Path
import yaml
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
## Set up pipeline for PDF or image inputs
converter = DocumentConverter(
format_options={
@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
for source in sources:
start_time = time.time()
print("================================================")
print("Processing... {}".format(source))
print(f"Processing... {source}")
print("================================================")
print("")
@@ -77,7 +71,7 @@ for source in sources:
print(page.predictions.vlm_response.text)
res.document.save_as_html(
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
filename=Path(f"{out_path}/{res.input.file.stem}.html"),
image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
)


@@ -144,7 +144,7 @@
"for pic in doc.pictures[:5]:\n",
" html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n",
" for annotation in pic.annotations:\n",
@@ -252,7 +252,7 @@
"for pic in doc.pictures[:5]:\n",
" html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n",
" for annotation in pic.annotations:\n",


@@ -283,7 +283,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -369,7 +369,7 @@
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
" try:\n",
" index_client.delete_index(index_name)\n",
" except:\n",
" except Exception:\n",
" pass\n",
"\n",
" index_client.create_or_update_index(new_index)\n",
@@ -487,7 +487,7 @@
"\n",
" all_succeeded = all(r.succeeded for r in resp)\n",
" console.print(\n",
" f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
" f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n",
" f\"first_doc_status_code: {resp[0].status_code}\"\n",
" )\n",
"\n",
@@ -807,10 +807,12 @@
}
],
"source": [
"from typing import Optional\n",
"\n",
"from azure.search.documents.models import VectorizableTextQuery\n",
"\n",
"\n",
"def generate_chat_response(prompt: str, system_message: str = None):\n",
"def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
" \"\"\"\n",
" Generates a single-turn chat response using Azure OpenAI Chat.\n",
" If you need multi-turn conversation or follow-up queries, you'll have to\n",


@@ -351,7 +351,7 @@
"for source in sources:\n",
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
" print(f\"- text: {repr(doc_chunk.text)}\")\n",
" print(f\"- text: {doc_chunk.text!r}\")\n",
" if doc_chunk.meta.origin:\n",
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
" if doc_chunk.meta.headings:\n",


@@ -341,7 +341,7 @@
"print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
"for i, doc in enumerate(resp_dict[\"context\"]):\n",
" print()\n",
" print(f\"Source {i+1}:\")\n",
" print(f\"Source {i + 1}:\")\n",
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
" for key in doc.metadata:\n",
" if key != \"pk\":\n",


@@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"collapsed": true,
"id": "u076oUSF_YUG"
@@ -72,12 +72,11 @@
"%pip install rich\n",
"%pip install torch\n",
"\n",
"import logging\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"import logging\n",
"\n",
"# Suppress Weaviate client logs\n",
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
]
@@ -119,7 +118,7 @@
" device = torch.device(\"mps\")\n",
" print(\"MPS GPU is enabled.\")\n",
"else:\n",
" raise EnvironmentError(\n",
" raise OSError(\n",
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
" )"
]
@@ -226,7 +225,6 @@
}
],
"source": [
"from docling.datamodel.document import ConversionResult\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Instantiate the doc converter\n",
@@ -345,7 +343,7 @@
"\n",
" openai_api_key = os.getenv(openai_api_key_var)\n",
" if not openai_api_key:\n",
" raise EnvironmentError(\n",
" raise OSError(\n",
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
" \"Please define it before running this script.\"\n",
" )"
@@ -387,7 +385,6 @@
"outputs": [],
"source": [
"import weaviate.classes.config as wc\n",
"from weaviate.classes.config import DataType, Property\n",
"\n",
"# Define the collection name\n",
"collection_name = \"docling\"\n",


@@ -25,9 +25,7 @@ def main():
document = mdb.convert()
out_path = Path("scratch")
print(
f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
)
print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")
# Export Docling document format to markdowndoc:
fn = os.path.basename(path)


@@ -1,13 +1,10 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -63,7 +63,7 @@ def main():
out_path = Path("scratch")
print(
f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}"
f"\nSaved markdown output to: {out_path!s}"
)
_log.debug(res.document._export_to_indented_text(max_text_len=16))
# Export Docling document format to markdowndoc:


@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -2,9 +2,9 @@ import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0
# FIXME: put in your favorite translation code ....
def translate(text: str, src: str = "en", dest: str = "de"):
_log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
# from googletrans import Translator
@@ -52,10 +51,9 @@ def main():
}
)
start_time = time.time()
conv_res = doc_converter.convert(input_doc_path)
conv_doc = conv_res.document
doc_filename = conv_res.input.file
# Save markdown with embedded pictures in original text
md_filename = output_dir / f"{doc_filename}-with-images-orig.md"


@@ -432,7 +432,7 @@
"\n",
"for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
" image_by_page = {}\n",
" print(f\"Source {i+1}:\")\n",
" print(f\"Source {i + 1}:\")\n",
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
" meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
"\n",


@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
ApiVlmOptions,
ResponseFormat,
VlmPipelineOptions,
granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline