From f63312add6981c8f8cf4c0efc8cc76735cb744b0 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Sun, 1 Jun 2025 17:55:16 +0200
Subject: [PATCH] use lowercase and uppercase only

Signed-off-by: Michele Dolfi
---
 docling/datamodel/pipeline_options.py         |   7 +-
 docling/datamodel/pipeline_vlm_model_spec.py  |   8 +-
 ...vlm_model_LlavaForConditionalGeneration.py | 152 ------------------
 .../__init__.py                               |   0
 .../hf_transformers_causallm_model.py}        |   0
 .../hf_transformers_vision2seq_model.py}      |   0
 .../mlx_model.py}                             |   0
 docling/pipeline/vlm_pipeline.py              |  29 +---
 8 files changed, 13 insertions(+), 183 deletions(-)
 delete mode 100644 docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
 rename docling/models/{hf_vlm_models => vlm_models_inline}/__init__.py (100%)
 rename docling/models/{hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py => vlm_models_inline/hf_transformers_causallm_model.py} (100%)
 rename docling/models/{hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py => vlm_models_inline/hf_transformers_vision2seq_model.py} (100%)
 rename docling/models/{hf_vlm_models/hf_vlm_mlx_model.py => vlm_models_inline/mlx_model.py} (100%)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 133a31fd..46123835 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -319,11 +319,8 @@ class ResponseFormat(str, Enum):
 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
-    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
-    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
-    TRANSFORMERS_LlavaForConditionalGeneration = (
-        "transformers-LlavaForConditionalGeneration"
-    )
+    TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
+    TRANSFORMERS_CAUSALLM = "transformers-causallm"
 
 
 class HuggingFaceVlmOptions(BaseVlmOptions):
diff --git a/docling/datamodel/pipeline_vlm_model_spec.py b/docling/datamodel/pipeline_vlm_model_spec.py
index 6547600d..71b2ebe5 100644
--- a/docling/datamodel/pipeline_vlm_model_spec.py
+++ b/docling/datamodel/pipeline_vlm_model_spec.py
@@ -29,7 +29,7 @@ SMOLDOCLING_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -39,7 +39,7 @@ GRANITE_VISION_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="ibm-granite/granite-vision-3.2-2b",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -59,7 +59,7 @@ PIXTRAL_12B_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="mistral-community/pixtral-12b",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -78,7 +78,7 @@ PHI4_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="microsoft/Phi-4-multimodal-instruct",
     prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
+    inference_framework=InferenceFramework.TRANSFORMERS_CAUSALLM,
     scale=2.0,
     temperature=0.0,
 )
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
deleted file mode 100644
index cd708b89..00000000
--- a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import logging
-import time
-from collections.abc import Iterable
-from pathlib import Path
-from typing import Optional
-
-from docling.datamodel.base_models import Page, VlmPrediction
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
-    HuggingFaceVlmOptions,
-)
-from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.utils.accelerator_utils import decide_device
-from docling.utils.profiling import TimeRecorder
-
-_log = logging.getLogger(__name__)
-
-
-class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
-    def __init__(
-        self,
-        enabled: bool,
-        artifacts_path: Optional[Path],
-        accelerator_options: AcceleratorOptions,
-        vlm_options: HuggingFaceVlmOptions,
-    ):
-        self.enabled = enabled
-
-        self.trust_remote_code = True
-
-        self.vlm_options = vlm_options
-
-        if self.enabled:
-            from transformers import (  # type: ignore
-                AutoProcessor,
-                LlavaForConditionalGeneration,
-            )
-
-            self.device = decide_device(accelerator_options.device)
-            self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
-
-            self.use_cache = vlm_options.use_kv_cache
-            self.max_new_tokens = vlm_options.max_new_tokens
-            self.temperature = vlm_options.temperature
-
-            _log.debug(f"Available device for VLM: {self.device}")
-            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
-
-            if artifacts_path is None:
-                artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
-                )
-            elif (artifacts_path / repo_cache_folder).exists():
-                artifacts_path = artifacts_path / repo_cache_folder
-
-            self.processor = AutoProcessor.from_pretrained(
-                artifacts_path,
-                trust_remote_code=self.trust_remote_code,
-            )
-            self.vlm_model = LlavaForConditionalGeneration.from_pretrained(
-                artifacts_path,
-                device_map=self.device,
-                # torch_dtype="auto",
-                # quantization_config=self.param_quantization_config,
-                _attn_implementation=(
-                    "flash_attention_2"
-                    if self.device.startswith("cuda")
-                    and accelerator_options.cuda_use_flash_attention2
-                    else "eager"
-                ),
-            ).to(self.device)
-
-    def __call__(
-        self, conv_res: ConversionResult, page_batch: Iterable[Page]
-    ) -> Iterable[Page]:
-        for page in page_batch:
-            assert page._backend is not None
-            if not page._backend.is_valid():
-                yield page
-            else:
-                with TimeRecorder(conv_res, "vlm"):
-                    assert page.size is not None
-
-                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
-                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
-
-                    if hi_res_image is not None:
-                        im_width, im_height = hi_res_image.size
-
-                    """
-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
-                    """
-
-                    images = [hi_res_image]
-
-                    # Define prompt structure
-                    prompt = self.formulate_prompt()
-
-                    inputs = self.processor(
-                        text=prompt, images=images, return_tensors="pt"
-                    ).to(self.device)
-
-                    # Generate response
-                    start_time = time.time()
-                    generate_ids = self.vlm_model.generate(
-                        **inputs,
-                        max_new_tokens=self.max_new_tokens,
-                        use_cache=self.use_cache,  # Enables KV caching which can improve performance
-                        temperature=self.temperature,
-                    )
-
-                    # num_tokens = len(generate_ids[0])
-                    generation_time = time.time() - start_time
-
-                    response = self.processor.batch_decode(
-                        generate_ids,
-                        skip_special_tokens=True,
-                        clean_up_tokenization_spaces=False,
-                    )[0]
-
-                    page.predictions.vlm_response = VlmPrediction(
-                        text=response,
-                        # generated_tokens=num_tokens,
-                        generation_time=generation_time,
-                    )
-
-                yield page
-
-    def formulate_prompt(self) -> str:
-        """Formulate a prompt for the VLM."""
-        if self.vlm_options.repo_id == "mistral-community/pixtral-12b":
-            chat = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "content": self.vlm_options.prompt},
-                        {"type": "image"},
-                    ],
-                }
-            ]
-            prompt = self.processor.apply_chat_template(chat)
-            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
-
-            return prompt
-        else:
-            raise ValueError(f"No prompt template for {self.vlm_options.repo_id}")
-
-        return ""
diff --git a/docling/models/hf_vlm_models/__init__.py b/docling/models/vlm_models_inline/__init__.py
similarity index 100%
rename from docling/models/hf_vlm_models/__init__.py
rename to docling/models/vlm_models_inline/__init__.py
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
similarity index 100%
rename from docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
rename to docling/models/vlm_models_inline/hf_transformers_causallm_model.py
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
similarity index 100%
rename from docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
rename to docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
similarity index 100%
rename from docling/models/hf_vlm_models/hf_vlm_mlx_model.py
rename to docling/models/vlm_models_inline/mlx_model.py
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 0c6237e1..67723e2c 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -36,18 +36,15 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel
-
-# from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.models.hf_vlm_models.hf_vlm_mlx_model import HuggingFaceMlxModel
-from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForCausalLM import (
+from docling.models.vlm_models_inline.hf_transformers_causallm_model import (
     HuggingFaceVlmModel_AutoModelForCausalLM,
 )
-from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForVision2Seq import (
+from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
     HuggingFaceVlmModel_AutoModelForVision2Seq,
 )
-from docling.models.hf_vlm_models.hf_vlm_model_LlavaForConditionalGeneration import (
-    HuggingFaceVlmModel_LlavaForConditionalGeneration,
-)
+
+# from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
@@ -103,7 +100,7 @@ class VlmPipeline(PaginatedPipeline):
                 ]
             elif (
                 vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq
+                == InferenceFramework.TRANSFORMERS_VISION2SEQ
             ):
                 self.build_pipe = [
                     HuggingFaceVlmModel_AutoModelForVision2Seq(
@@ -115,7 +112,7 @@ class VlmPipeline(PaginatedPipeline):
                 ]
             elif (
                 vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_AutoModelForCausalLM
+                == InferenceFramework.TRANSFORMERS_CAUSALLM
            ):
                 self.build_pipe = [
                     HuggingFaceVlmModel_AutoModelForCausalLM(
@@ -125,18 +122,6 @@ class VlmPipeline(PaginatedPipeline):
                         vlm_options=vlm_options,
                     ),
                 ]
-            elif (
-                vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration
-            ):
-                self.build_pipe = [
-                    HuggingFaceVlmModel_LlavaForConditionalGeneration(
-                        enabled=True,  # must be always enabled for this pipeline to make sense.
-                        artifacts_path=artifacts_path,
-                        accelerator_options=pipeline_options.accelerator_options,
-                        vlm_options=vlm_options,
-                    ),
-                ]
             else:
                 raise ValueError(
                     f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
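
Usage sketch (not part of the patch): a minimal example of how the renamed enum values are exercised, going through the GRANITE_VISION_TRANSFORMERS preset that this patch switches to InferenceFramework.TRANSFORMERS_VISION2SEQ. DocumentConverter, PdfFormatOption, VlmPipelineOptions, InputFormat and the sample file path are assumed from docling's public API outside this diff and may differ in detail.

    # Illustrative only; converter wiring is assumed from docling's public API,
    # it is not modified by this patch.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import VlmPipelineOptions
    from docling.datamodel.pipeline_vlm_model_spec import GRANITE_VISION_TRANSFORMERS
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # GRANITE_VISION_TRANSFORMERS now carries
    # inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ, so VlmPipeline
    # should build HuggingFaceVlmModel_AutoModelForVision2Seq for this run.
    pipeline_options = VlmPipelineOptions(vlm_options=GRANITE_VISION_TRANSFORMERS)

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert("example.pdf")  # any local PDF
    print(result.document.export_to_markdown())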