From f63312add6981c8f8cf4c0efc8cc76735cb744b0 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Sun, 1 Jun 2025 17:55:16 +0200
Subject: [PATCH] use lowercase and uppercase only

Signed-off-by: Michele Dolfi
---
 docling/datamodel/pipeline_options.py         |   7 +-
 docling/datamodel/pipeline_vlm_model_spec.py  |   8 +-
 ...vlm_model_LlavaForConditionalGeneration.py | 152 ------------------
 .../__init__.py                               |   0
 .../hf_transformers_causallm_model.py}        |   0
 .../hf_transformers_vision2seq_model.py}      |   0
 .../mlx_model.py}                             |   0
 docling/pipeline/vlm_pipeline.py              |  29 +---
 8 files changed, 13 insertions(+), 183 deletions(-)
 delete mode 100644 docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
 rename docling/models/{hf_vlm_models => vlm_models_inline}/__init__.py (100%)
 rename docling/models/{hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py => vlm_models_inline/hf_transformers_causallm_model.py} (100%)
 rename docling/models/{hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py => vlm_models_inline/hf_transformers_vision2seq_model.py} (100%)
 rename docling/models/{hf_vlm_models/hf_vlm_mlx_model.py => vlm_models_inline/mlx_model.py} (100%)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 133a31fd..46123835 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -319,11 +319,8 @@ class ResponseFormat(str, Enum):
 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
-    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
-    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
-    TRANSFORMERS_LlavaForConditionalGeneration = (
-        "transformers-LlavaForConditionalGeneration"
-    )
+    TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
+    TRANSFORMERS_CAUSALLM = "transformers-causallm"
 
 
 class HuggingFaceVlmOptions(BaseVlmOptions):
diff --git a/docling/datamodel/pipeline_vlm_model_spec.py b/docling/datamodel/pipeline_vlm_model_spec.py
index 6547600d..71b2ebe5 100644
--- a/docling/datamodel/pipeline_vlm_model_spec.py
+++ b/docling/datamodel/pipeline_vlm_model_spec.py
@@ -29,7 +29,7 @@ SMOLDOCLING_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -39,7 +39,7 @@ GRANITE_VISION_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="ibm-granite/granite-vision-3.2-2b",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -59,7 +59,7 @@ PIXTRAL_12B_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="mistral-community/pixtral-12b",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -78,7 +78,7 @@ PHI4_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="microsoft/Phi-4-multimodal-instruct",
     prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
+    inference_framework=InferenceFramework.TRANSFORMERS_CAUSALLM,
     scale=2.0,
     temperature=0.0,
 )
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
deleted file mode 100644
index cd708b89..00000000
--- a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import logging
-import time
-from collections.abc import Iterable
-from pathlib import Path
-from typing import Optional
-
-from docling.datamodel.base_models import Page, VlmPrediction
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
-    HuggingFaceVlmOptions,
-)
-from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.utils.accelerator_utils import decide_device
-from docling.utils.profiling import TimeRecorder
-
-_log = logging.getLogger(__name__)
-
-
-class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
-    def __init__(
-        self,
-        enabled: bool,
-        artifacts_path: Optional[Path],
-        accelerator_options: AcceleratorOptions,
-        vlm_options: HuggingFaceVlmOptions,
-    ):
-        self.enabled = enabled
-
-        self.trust_remote_code = True
-
-        self.vlm_options = vlm_options
-
-        if self.enabled:
-            from transformers import (  # type: ignore
-                AutoProcessor,
-                LlavaForConditionalGeneration,
-            )
-
-            self.device = decide_device(accelerator_options.device)
-            self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
-
-            self.use_cache = vlm_options.use_kv_cache
-            self.max_new_tokens = vlm_options.max_new_tokens
-            self.temperature = vlm_options.temperature
-
-            _log.debug(f"Available device for VLM: {self.device}")
-            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
-
-            if artifacts_path is None:
-                artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
-                )
-            elif (artifacts_path / repo_cache_folder).exists():
-                artifacts_path = artifacts_path / repo_cache_folder
-
-            self.processor = AutoProcessor.from_pretrained(
-                artifacts_path,
-                trust_remote_code=self.trust_remote_code,
-            )
-            self.vlm_model = LlavaForConditionalGeneration.from_pretrained(
-                artifacts_path,
-                device_map=self.device,
-                # torch_dtype="auto",
-                # quantization_config=self.param_quantization_config,
-                _attn_implementation=(
-                    "flash_attention_2"
-                    if self.device.startswith("cuda")
-                    and accelerator_options.cuda_use_flash_attention2
-                    else "eager"
-                ),
-            ).to(self.device)
-
-    def __call__(
-        self, conv_res: ConversionResult, page_batch: Iterable[Page]
-    ) -> Iterable[Page]:
-        for page in page_batch:
-            assert page._backend is not None
-            if not page._backend.is_valid():
-                yield page
-            else:
-                with TimeRecorder(conv_res, "vlm"):
-                    assert page.size is not None
-
-                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
-                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
-
-                    if hi_res_image is not None:
-                        im_width, im_height = hi_res_image.size
-
-                    """
-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
-                    """
-
-                    images = [hi_res_image]
-
-                    # Define prompt structure
-                    prompt = self.formulate_prompt()
-
-                    inputs = self.processor(
-                        text=prompt, images=images, return_tensors="pt"
-                    ).to(self.device)
-
-                    # Generate response
-                    start_time = time.time()
-                    generate_ids = self.vlm_model.generate(
-                        **inputs,
-                        max_new_tokens=self.max_new_tokens,
-                        use_cache=self.use_cache,  # Enables KV caching which can improve performance
-                        temperature=self.temperature,
-                    )
-
-                    # num_tokens = len(generate_ids[0])
-                    generation_time = time.time() - start_time
-
-                    response = self.processor.batch_decode(
-                        generate_ids,
-                        skip_special_tokens=True,
-                        clean_up_tokenization_spaces=False,
-                    )[0]
-
-                    page.predictions.vlm_response = VlmPrediction(
-                        text=response,
-                        # generated_tokens=num_tokens,
-                        generation_time=generation_time,
-                    )
-
-                yield page
-
-    def formulate_prompt(self) -> str:
-        """Formulate a prompt for the VLM."""
-        if self.vlm_options.repo_id == "mistral-community/pixtral-12b":
-            chat = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "content": self.vlm_options.prompt},
-                        {"type": "image"},
-                    ],
-                }
-            ]
-            prompt = self.processor.apply_chat_template(chat)
-            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
-
-            return prompt
-        else:
-            raise ValueError(f"No prompt template for {self.vlm_options.repo_id}")
-
-        return ""
diff --git a/docling/models/hf_vlm_models/__init__.py b/docling/models/vlm_models_inline/__init__.py
similarity index 100%
rename from docling/models/hf_vlm_models/__init__.py
rename to docling/models/vlm_models_inline/__init__.py
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
similarity index 100%
rename from docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
rename to docling/models/vlm_models_inline/hf_transformers_causallm_model.py
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
similarity index 100%
rename from docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
rename to docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
similarity index 100%
rename from docling/models/hf_vlm_models/hf_vlm_mlx_model.py
rename to docling/models/vlm_models_inline/mlx_model.py
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 0c6237e1..67723e2c 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -36,18 +36,15 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel
-
-# from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.models.hf_vlm_models.hf_vlm_mlx_model import HuggingFaceMlxModel
-from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForCausalLM import (
+from docling.models.vlm_models_inline.hf_transformers_causallm_model import (
     HuggingFaceVlmModel_AutoModelForCausalLM,
 )
-from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForVision2Seq import (
+from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
     HuggingFaceVlmModel_AutoModelForVision2Seq,
 )
-from docling.models.hf_vlm_models.hf_vlm_model_LlavaForConditionalGeneration import (
-    HuggingFaceVlmModel_LlavaForConditionalGeneration,
-)
+
+# from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
@@ -103,7 +100,7 @@ class VlmPipeline(PaginatedPipeline):
                 ]
             elif (
                 vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq
+                == InferenceFramework.TRANSFORMERS_VISION2SEQ
             ):
                 self.build_pipe = [
                     HuggingFaceVlmModel_AutoModelForVision2Seq(
@@ -115,7 +112,7 @@ class VlmPipeline(PaginatedPipeline):
                 ]
             elif (
                 vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_AutoModelForCausalLM
+                == InferenceFramework.TRANSFORMERS_CAUSALLM
            ):
                 self.build_pipe = [
                     HuggingFaceVlmModel_AutoModelForCausalLM(
@@ -125,18 +122,6 @@ class VlmPipeline(PaginatedPipeline):
                         vlm_options=vlm_options,
                     ),
                 ]
-            elif (
-                vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration
-            ):
-                self.build_pipe = [
-                    HuggingFaceVlmModel_LlavaForConditionalGeneration(
-                        enabled=True,  # must be always enabled for this pipeline to make sense.
-                        artifacts_path=artifacts_path,
-                        accelerator_options=pipeline_options.accelerator_options,
-                        vlm_options=vlm_options,
-                    ),
-                ]
             else:
                 raise ValueError(
                     f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
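
Usage sketch (not part of the patch): a minimal example of how the renamed enum values are exercised, going through the GRANITE_VISION_TRANSFORMERS preset that this patch switches to InferenceFramework.TRANSFORMERS_VISION2SEQ. DocumentConverter, PdfFormatOption, VlmPipelineOptions, InputFormat and the sample file path are assumed from docling's public API outside this diff and may differ in detail.

    # Illustrative only; converter wiring is assumed from docling's public API,
    # it is not modified by this patch.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import VlmPipelineOptions
    from docling.datamodel.pipeline_vlm_model_spec import GRANITE_VISION_TRANSFORMERS
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # GRANITE_VISION_TRANSFORMERS now carries
    # inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ, so VlmPipeline
    # should build HuggingFaceVlmModel_AutoModelForVision2Seq for this run.
    pipeline_options = VlmPipelineOptions(vlm_options=GRANITE_VISION_TRANSFORMERS)

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert("example.pdf")  # any local PDF
    print(result.document.export_to_markdown())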