Mirror of https://github.com/DS4SD/docling.git (synced 2025-12-12 06:38:10 +00:00)
ci: add coverage and ruff (#1383)
* add coverage calculation and push
* new codecov version and usage of token
* enable ruff formatter instead of black and isort
* apply ruff lint fixes
* apply ruff unsafe fixes
* add removed imports
* runs 1 on linter issues
* finalize linter fixes
* Update pyproject.toml

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
@@ -1,8 +1,8 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Any, Dict, List, Tuple, Union
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
 
 from docling.datamodel.document import ConversionResult, Page
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
 def generate_multimodal_pages(
     doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
-
     label_to_doclaynet = {
         "title": "title",
         "table-of-contents": "document_index",
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
     if doc.main_text is None:
         return
     for ix, orig_item in enumerate(doc.main_text):
-
         item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
         if item is None or item.prov is None or len(item.prov) == 0:
             _log.debug(f"Skipping item {orig_item}")
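Note on the import hunk above: ruff's pyupgrade checks (UP035) flag the container ABCs imported from typing, since Python 3.9 deprecates typing.Iterable and friends in favor of collections.abc. A minimal sketch of the pattern, assuming Python 3.9+ and with names invented for illustration:

# Prefer collections.abc for abstract container types; keep typing for
# constructs that have no abc counterpart (Any, Union, Optional, ...).
from collections.abc import Iterable
from typing import Any


def first_or_none(items: Iterable[Any]) -> Any:
    # collections.abc.Iterable is subscriptable in annotations on 3.9+,
    # just like the deprecated typing.Iterable alias it replaces.
    for item in items:
        return item
    return None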
@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
 
     try:
         key = int(paths[0])
-    except:
+    except Exception:
         key = paths[0]
 
     if len(paths) == 1:
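Note on the resolve_item hunk: a bare except (flagged as E722) also swallows KeyboardInterrupt and SystemExit, which derive from BaseException; narrowing to Exception keeps those control-flow exceptions propagating. A small sketch, not docling code:

def to_int_or_str(value):
    # Catching Exception instead of using a bare except lets
    # KeyboardInterrupt and SystemExit propagate, which is exactly
    # what the lint fix above preserves.
    try:
        return int(value)
    except Exception:
        return str(value)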
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
     return unique_objects
 
 
-def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
     origin = DocumentOrigin(
         mimetype="application/pdf",
         filename=doc_glm["file-info"]["filename"],
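Note on the to_docling_document hunk: C901 is the mccabe complexity check that ruff can enforce, and the added # noqa: C901 suppresses only that rule on this one definition instead of relaxing it project-wide. A hypothetical example of the same targeted suppression:

# A targeted suppression silences only the named rule on that line;
# all other checks (unused imports, bare except, ...) still apply.
def busy_function(flag_a, flag_b, flag_c):  # noqa: C901
    if flag_a:
        return 1
    elif flag_b:
        return 2
    elif flag_c:
        return 3
    return 0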
@@ -18,7 +18,7 @@ class UnionFind:
 
     def __init__(self, elements):
         self.parent = {elem: elem for elem in elements}
-        self.rank = {elem: 0 for elem in elements}
+        self.rank = dict.fromkeys(elements, 0)
 
     def find(self, x):
         if self.parent[x] != x:
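Note on the UnionFind hunk: rewriting a constant-valued dict comprehension as dict.fromkeys is one of ruff's suggested simplifications (likely RUF025 here). The two forms build the same mapping for immutable values; the caveat worth remembering is that fromkeys reuses a single value object for every key. A short sketch:

elements = ["a", "b", "c"]

# Equivalent for an immutable value such as 0:
rank_a = {elem: 0 for elem in elements}
rank_b = dict.fromkeys(elements, 0)
assert rank_a == rank_b

# Caveat: fromkeys shares one value object across all keys, which only
# matters for mutable values (every key points at the same list here).
shared = dict.fromkeys(elements, [])
shared["a"].append(1)
assert shared["b"] == [1]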
@@ -484,7 +484,9 @@ class LayoutPostprocessor:
         spatial_index = (
             self.regular_index
             if cluster_type == "regular"
-            else self.picture_index if cluster_type == "picture" else self.wrapper_index
+            else self.picture_index
+            if cluster_type == "picture"
+            else self.wrapper_index
         )
 
         # Map of currently valid clusters
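Note on the LayoutPostprocessor hunk: this is a formatting-only change from the ruff formatter, splitting the nested conditional expression across lines; the selection logic is untouched and is equivalent to a plain if/elif chain, roughly as below (function and parameter names invented for illustration):

def pick_index(cluster_type, regular_index, picture_index, wrapper_index):
    # Same selection as the nested conditional expression in the hunk,
    # written out as an if/elif chain.
    if cluster_type == "regular":
        return regular_index
    elif cluster_type == "picture":
        return picture_index
    return wrapper_index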
@@ -37,7 +37,7 @@ def download_models(
     output_dir.mkdir(exist_ok=True, parents=True)
 
     if with_layout:
-        _log.info(f"Downloading layout model...")
+        _log.info("Downloading layout model...")
         LayoutModel.download_models(
             local_dir=output_dir / LayoutModel._model_repo_folder,
             force=force,
@@ -45,7 +45,7 @@ def download_models(
         )
 
     if with_tableformer:
-        _log.info(f"Downloading tableformer model...")
+        _log.info("Downloading tableformer model...")
         TableStructureModel.download_models(
             local_dir=output_dir / TableStructureModel._model_repo_folder,
             force=force,
@@ -53,7 +53,7 @@ def download_models(
         )
 
     if with_picture_classifier:
-        _log.info(f"Downloading picture classifier model...")
+        _log.info("Downloading picture classifier model...")
         DocumentPictureClassifier.download_models(
             local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
             force=force,
@@ -61,7 +61,7 @@ def download_models(
         )
 
     if with_code_formula:
-        _log.info(f"Downloading code formula model...")
+        _log.info("Downloading code formula model...")
         CodeFormulaModel.download_models(
             local_dir=output_dir / CodeFormulaModel._model_repo_folder,
             force=force,
@@ -69,7 +69,7 @@ def download_models(
         )
 
     if with_smolvlm:
-        _log.info(f"Downloading SmolVlm model...")
+        _log.info("Downloading SmolVlm model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@@ -78,7 +78,7 @@ def download_models(
         )
 
     if with_granite_vision:
-        _log.info(f"Downloading Granite Vision model...")
+        _log.info("Downloading Granite Vision model...")
        PictureDescriptionVlmModel.download_models(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
@@ -87,7 +87,7 @@ def download_models(
         )
 
     if with_easyocr:
-        _log.info(f"Downloading easyocr models...")
+        _log.info("Downloading easyocr models...")
         EasyOcrModel.download_models(
             local_dir=output_dir / EasyOcrModel._model_repo_folder,
             force=force,
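Note on the download_models hunks: every one applies the same fix for an f-string with no placeholders (ruff F541), where the f-prefix does nothing and a plain string literal suffices. For example:

model_name = "layout"

# f-prefix needed: the string interpolates a value.
msg_dynamic = f"Downloading {model_name} model..."

# No placeholders, so a plain literal is enough (what the diff applies).
msg_static = "Downloading layout model..."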
@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
     if isinstance(iterator, List):
         iterator = iter(iterator)
     for first in iterator:  # Take the first element from the iterator
-        yield [first] + list(islice(iterator, chunk_size - 1))
+        yield [first, *list(islice(iterator, chunk_size - 1))]
 
 
 def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
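Note on the chunkify hunk: list concatenation is replaced by iterable unpacking (ruff RUF005); [first, *rest] builds the same list as [first] + rest without creating a throwaway single-element list. A condensed, illustrative re-implementation of the chunking pattern (not the exact docling helper):

from itertools import islice


def chunkify(iterable, chunk_size):
    # Yield successive lists of up to chunk_size items from any iterable.
    iterator = iter(iterable)
    for first in iterator:  # Take the first element from the iterator
        yield [first, *list(islice(iterator, chunk_size - 1))]


assert list(chunkify(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]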