From 800668300766225daebbeb8557090736e7bde7cb Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Mon, 2 Jun 2025 12:58:32 +0200
Subject: [PATCH] remove hf_vlm_model and add extra_generation_args

Signed-off-by: Michele Dolfi
---
 .../datamodel/pipeline_options_vlm_model.py   |  3 +-
 docling/datamodel/vlm_model_specs.py          |  1 +
 docling/models/code_formula_model.py          | 15 +++----
 docling/models/document_picture_classifier.py | 15 +++----
 docling/models/hf_vlm_model.py                | 28 -------------
 docling/models/layout_model.py                | 15 +++----
 .../models/picture_description_vlm_model.py   | 27 +++----------
 docling/models/table_structure_model.py       | 15 +++----
 docling/models/utils/__init__.py              |  0
 docling/models/utils/hf_model_download.py     | 40 +++++++++++++++++++
 .../hf_transformers_causallm_model.py         | 14 ++++---
 .../hf_transformers_vision2seq_model.py       | 13 +++---
 docling/models/vlm_models_inline/mlx_model.py | 12 +++---
 docling/utils/model_downloader.py             | 10 ++---
 14 files changed, 94 insertions(+), 114 deletions(-)
 delete mode 100644 docling/models/hf_vlm_model.py
 create mode 100644 docling/models/utils/__init__.py
 create mode 100644 docling/models/utils/hf_model_download.py

diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 07185107..d4510318 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -46,7 +46,8 @@ class InlineVlmOptions(BaseVlmOptions):
     scale: float = 2.0
 
     temperature: float = 0.0
-    stop_strings: list[str] = []
+    stop_strings: List[str] = []
+    extra_generation_config: Dict[str, Any] = {}
 
     use_kv_cache: bool = True
     max_new_tokens: int = 4096
diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py
index 663cbd73..85ad1b40 100644
--- a/docling/datamodel/vlm_model_specs.py
+++ b/docling/datamodel/vlm_model_specs.py
@@ -97,6 +97,7 @@ PHI4_TRANSFORMERS = InlineVlmOptions(
     supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
     scale=2.0,
     temperature=0.0,
+    extra_generation_config=dict(num_logits_to_keep=0),
 )
 
 # Qwen
diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py
index eae0b82a..19a831ab 100644
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@@ -19,6 +19,7 @@ from pydantic import BaseModel
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement
 from docling.models.base_model import BaseItemAndImageEnrichmentModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
 
@@ -117,20 +118,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         force: bool = False,
         progress: bool = False,
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/CodeFormula",
-            force_download=force,
-            local_dir=local_dir,
             revision="v1.0.2",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         """
         Determines if a given element in a document can be processed by the model.
diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py
index 10216306..73a30203 100644
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -15,6 +15,7 @@ from pydantic import BaseModel
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.models.base_model import BaseEnrichmentModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
 
@@ -105,20 +106,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def download_models(
         local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/DocumentFigureClassifier",
-            force_download=force,
-            local_dir=local_dir,
             revision="v1.0.1",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         """
         Determines if the given element can be processed by the classifier.
diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py
deleted file mode 100644
index 20a1c7dd..00000000
--- a/docling/models/hf_vlm_model.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import logging
-from pathlib import Path
-from typing import Optional
-
-_log = logging.getLogger(__name__)
-
-
-class HuggingFaceVlmModel:
-    @staticmethod
-    def download_models(
-        repo_id: str,
-        local_dir: Optional[Path] = None,
-        force: bool = False,
-        progress: bool = False,
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id=repo_id,
-            force_download=force,
-            local_dir=local_dir,
-            # revision="v0.0.1",
-        )
-
-        return Path(download_path)
diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index 68c807c9..d8e9c032 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -15,6 +15,7 @@ from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
 from docling.utils.profiling import TimeRecorder
@@ -83,20 +84,14 @@ class LayoutModel(BasePageModel):
         force: bool = False,
         progress: bool = False,
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/docling-models",
-            force_download=force,
+            revision="v2.2.0",
             local_dir=local_dir,
-            revision="v2.1.0",
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def draw_clusters_and_cells_side_by_side(
         self, conv_res, page, clusters, mode_prefix: str, show: bool = False
     ):
diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
index 5a79f1b6..230151d6 100644
--- a/docling/models/picture_description_vlm_model.py
+++ b/docling/models/picture_description_vlm_model.py
@@ -10,10 +10,15 @@ from docling.datamodel.pipeline_options import (
     PictureDescriptionVlmOptions,
 )
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.accelerator_utils import decide_device
 
 
-class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
+class PictureDescriptionVlmModel(
+    PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
+):
     @classmethod
     def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
         return PictureDescriptionVlmOptions
@@ -66,26 +71,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
 
             self.provenance = f"{self.options.repo_id}"
 
-    @staticmethod
-    def download_models(
-        repo_id: str,
-        local_dir: Optional[Path] = None,
-        force: bool = False,
-        progress: bool = False,
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id=repo_id,
-            force_download=force,
-            local_dir=local_dir,
-        )
-
-        return Path(download_path)
-
     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
         from transformers import GenerationConfig
 
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index 55a04afb..b90e85d5 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -22,6 +22,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
@@ -89,20 +90,14 @@ class TableStructureModel(BasePageModel):
     def download_models(
         local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
             revision="v2.2.0",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def draw_table_and_cells(
         self,
         conv_res: ConversionResult,
diff --git a/docling/models/utils/__init__.py b/docling/models/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/docling/models/utils/hf_model_download.py b/docling/models/utils/hf_model_download.py
new file mode 100644
index 00000000..3595166a
--- /dev/null
+++ b/docling/models/utils/hf_model_download.py
@@ -0,0 +1,40 @@
+import logging
+from pathlib import Path
+from typing import Optional
+
+_log = logging.getLogger(__name__)
+
+
+def download_hf_model(
+    repo_id: str,
+    local_dir: Optional[Path] = None,
+    force: bool = False,
+    progress: bool = False,
+    revision: Optional[str] = None,
+) -> Path:
+    from huggingface_hub import snapshot_download
+    from huggingface_hub.utils import disable_progress_bars
+
+    if not progress:
+        disable_progress_bars()
+    download_path = snapshot_download(
+        repo_id=repo_id,
+        force_download=force,
+        local_dir=local_dir,
+        revision=revision,
+    )
+
+    return Path(download_path)
+
+
+class HuggingFaceModelDownloadMixin:
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        return download_hf_model(
+            repo_id=repo_id, local_dir=local_dir, force=force, progress=progress
+        )
diff --git a/docling/models/vlm_models_inline/hf_transformers_causallm_model.py b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
index d3d6a93d..aef23b79 100644
--- a/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
@@ -12,14 +12,18 @@ from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
+class HuggingFaceVlmModel_AutoModelForCausalLM(
+    BasePageModel, HuggingFaceModelDownloadMixin
+):
     def __init__(
         self,
         enabled: bool,
@@ -62,9 +66,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
             if artifacts_path is None:
-                artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
-                )
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
 
@@ -128,7 +130,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                         use_cache=self.use_cache,  # Enables KV caching which can improve performance
                         temperature=self.temperature,
                         generation_config=self.generation_config,
-                        num_logits_to_keep=1,
+                        **self.vlm_options.extra_generation_config,
                     )
                     generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
 
diff --git a/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
index 6a54d241..f92b9292 100644
--- a/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
@@ -11,14 +11,18 @@ from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
+class HuggingFaceVlmModel_AutoModelForVision2Seq(
+    BasePageModel, HuggingFaceModelDownloadMixin
+):
     def __init__(
         self,
         enabled: bool,
@@ -52,10 +56,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
 
             # PARAMETERS:
             if artifacts_path is None:
-                # artifacts_path = self.download_models(self.vlm_options.repo_id)
-                artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
-                )
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
 
diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
index 0949665e..d8b90407 100644
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -11,13 +11,15 @@ from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToke
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceMlxModel(BasePageModel):
+class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -48,12 +50,8 @@ class HuggingFaceMlxModel(BasePageModel):
 
             # PARAMETERS:
             if artifacts_path is None:
-                _log.debug(
-                    f"before HuggingFaceVlmModel.download_models: {self.vlm_options.repo_id}"
-                )
-                artifacts_path = HuggingFaceVlmModel.download_models(
+                artifacts_path = self.download_models(
                     self.vlm_options.repo_id,
-                    progress=True,
                 )
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py
index f8237fbc..55383c03 100644
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -14,10 +14,10 @@ from docling.datamodel.vlm_model_specs import (
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
 from docling.models.easyocr_model import EasyOcrModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.models.layout_model import LayoutModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.table_structure_model import TableStructureModel
+from docling.models.utils.hf_model_download import download_hf_model
 
 _log = logging.getLogger(__name__)
 
@@ -77,7 +77,7 @@ def download_models(
 
     if with_smolvlm:
         _log.info("Downloading SmolVlm model...")
-        PictureDescriptionVlmModel.download_models(
+        download_hf_model(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
             force=force,
@@ -86,7 +86,7 @@ def download_models(
 
     if with_smoldocling:
         _log.info("Downloading SmolDocling model...")
-        HuggingFaceVlmModel.download_models(
+        download_hf_model(
             repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
             local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
             force=force,
@@ -95,7 +95,7 @@ def download_models(
 
     if with_smoldocling_mlx:
         _log.info("Downloading SmolDocling MLX model...")
-        HuggingFaceVlmModel.download_models(
+        download_hf_model(
             repo_id=SMOLDOCLING_MLX.repo_id,
             local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
             force=force,
@@ -104,7 +104,7 @@ def download_models(
 
     if with_granite_vision:
         _log.info("Downloading Granite Vision model...")
-        PictureDescriptionVlmModel.download_models(
+        download_hf_model(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
             force=force,