Mirror of https://github.com/DS4SD/docling.git
Synced 2025-07-25 19:44:34 +00:00
use lowercase and uppercase only (no mixed-case InferenceFramework member names)
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent 8686842478
commit f63312add6
@@ -319,11 +319,8 @@ class ResponseFormat(str, Enum):
 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
-    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
-    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
-    TRANSFORMERS_LlavaForConditionalGeneration = (
-        "transformers-LlavaForConditionalGeneration"
-    )
+    TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
+    TRANSFORMERS_CAUSALLM = "transformers-causallm"


 class HuggingFaceVlmOptions(BaseVlmOptions):
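The rename changes both the member names and their string values to a single lowercase/uppercase convention, so downstream code that referenced the mixed-case members must be updated. A minimal before/after sketch; the import path is assumed, since the diff does not name the file:

    from docling.datamodel.pipeline_options import InferenceFramework  # import path assumed

    # before: InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq
    # after this commit:
    framework = InferenceFramework.TRANSFORMERS_VISION2SEQ
    assert framework.value == "transformers-vision2seq"
    # as a str Enum, a member still round-trips from its string value:
    assert InferenceFramework("transformers-causallm") is InferenceFramework.TRANSFORMERS_CAUSALLM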
@@ -29,7 +29,7 @@ SMOLDOCLING_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -39,7 +39,7 @@ GRANITE_VISION_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="ibm-granite/granite-vision-3.2-2b",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -59,7 +59,7 @@ PIXTRAL_12B_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="mistral-community/pixtral-12b",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
     scale=2.0,
     temperature=0.0,
 )
@@ -78,7 +78,7 @@ PHI4_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="microsoft/Phi-4-multimodal-instruct",
     prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
+    inference_framework=InferenceFramework.TRANSFORMERS_CAUSALLM,
     scale=2.0,
     temperature=0.0,
 )
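All four built-in model specs switch to the renamed members in the same way. For illustration, a user-defined spec follows the same pattern; the repo id below is hypothetical and the import path is assumed:

    from docling.datamodel.pipeline_options import (  # import path assumed
        HuggingFaceVlmOptions,
        InferenceFramework,
        ResponseFormat,
    )

    # hypothetical custom spec, mirroring the built-in entries above
    MY_VLM_OPTIONS = HuggingFaceVlmOptions(
        repo_id="my-org/my-vision-model",  # hypothetical repo id
        prompt="Convert this page to markdown.",
        response_format=ResponseFormat.MARKDOWN,
        inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
        scale=2.0,
        temperature=0.0,
    )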
@@ -1,152 +0,0 @@
-import logging
-import time
-from collections.abc import Iterable
-from pathlib import Path
-from typing import Optional
-
-from docling.datamodel.base_models import Page, VlmPrediction
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
-    HuggingFaceVlmOptions,
-)
-from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.utils.accelerator_utils import decide_device
-from docling.utils.profiling import TimeRecorder
-
-_log = logging.getLogger(__name__)
-
-
-class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
-    def __init__(
-        self,
-        enabled: bool,
-        artifacts_path: Optional[Path],
-        accelerator_options: AcceleratorOptions,
-        vlm_options: HuggingFaceVlmOptions,
-    ):
-        self.enabled = enabled
-
-        self.trust_remote_code = True
-
-        self.vlm_options = vlm_options
-
-        if self.enabled:
-            from transformers import (  # type: ignore
-                AutoProcessor,
-                LlavaForConditionalGeneration,
-            )
-
-            self.device = decide_device(accelerator_options.device)
-            self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
-
-            self.use_cache = vlm_options.use_kv_cache
-            self.max_new_tokens = vlm_options.max_new_tokens
-            self.temperature = vlm_options.temperature
-
-            _log.debug(f"Available device for VLM: {self.device}")
-            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
-
-            if artifacts_path is None:
-                artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
-                )
-            elif (artifacts_path / repo_cache_folder).exists():
-                artifacts_path = artifacts_path / repo_cache_folder
-
-            self.processor = AutoProcessor.from_pretrained(
-                artifacts_path,
-                trust_remote_code=self.trust_remote_code,
-            )
-            self.vlm_model = LlavaForConditionalGeneration.from_pretrained(
-                artifacts_path,
-                device_map=self.device,
-                # torch_dtype="auto",
-                # quantization_config=self.param_quantization_config,
-                _attn_implementation=(
-                    "flash_attention_2"
-                    if self.device.startswith("cuda")
-                    and accelerator_options.cuda_use_flash_attention2
-                    else "eager"
-                ),
-            ).to(self.device)
-
-    def __call__(
-        self, conv_res: ConversionResult, page_batch: Iterable[Page]
-    ) -> Iterable[Page]:
-        for page in page_batch:
-            assert page._backend is not None
-            if not page._backend.is_valid():
-                yield page
-            else:
-                with TimeRecorder(conv_res, "vlm"):
-                    assert page.size is not None
-
-                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
-                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
-
-                    if hi_res_image is not None:
-                        im_width, im_height = hi_res_image.size
-
-                    """
-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
-                    """
-
-                    images = [hi_res_image]
-
-                    # Define prompt structure
-                    prompt = self.formulate_prompt()
-
-                    inputs = self.processor(
-                        text=prompt, images=images, return_tensors="pt"
-                    ).to(self.device)
-
-                    # Generate response
-                    start_time = time.time()
-                    generate_ids = self.vlm_model.generate(
-                        **inputs,
-                        max_new_tokens=self.max_new_tokens,
-                        use_cache=self.use_cache,  # Enables KV caching which can improve performance
-                        temperature=self.temperature,
-                    )
-
-                    # num_tokens = len(generate_ids[0])
-                    generation_time = time.time() - start_time
-
-                    response = self.processor.batch_decode(
-                        generate_ids,
-                        skip_special_tokens=True,
-                        clean_up_tokenization_spaces=False,
-                    )[0]
-
-                    page.predictions.vlm_response = VlmPrediction(
-                        text=response,
-                        # generated_tokens=num_tokens,
-                        generation_time=generation_time,
-                    )
-
-                yield page
-
-    def formulate_prompt(self) -> str:
-        """Formulate a prompt for the VLM."""
-        if self.vlm_options.repo_id == "mistral-community/pixtral-12b":
-            chat = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "content": self.vlm_options.prompt},
-                        {"type": "image"},
-                    ],
-                }
-            ]
-            prompt = self.processor.apply_chat_template(chat)
-            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
-
-            return prompt
-        else:
-            raise ValueError(f"No prompt template for {self.vlm_options.repo_id}")
-
-        return ""
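The deleted file wrapped transformers' LlavaForConditionalGeneration and customized only the prompt construction for Pixtral. For reference, the chat structure its formulate_prompt() rendered can be reproduced outside docling; a minimal sketch assuming only the transformers library and access to the Pixtral repo:

    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "text", "content": "Convert this page to markdown."},
                {"type": "image"},
            ],
        }
    ]
    # renders the user turn into a text prompt containing the image placeholder token
    prompt = processor.apply_chat_template(chat)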
@@ -36,18 +36,15 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel
-
-# from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.models.hf_vlm_models.hf_vlm_mlx_model import HuggingFaceMlxModel
-from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForCausalLM import (
+from docling.models.vlm_models_inline.hf_transformers_causallm_model import (
     HuggingFaceVlmModel_AutoModelForCausalLM,
 )
-from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForVision2Seq import (
+from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
     HuggingFaceVlmModel_AutoModelForVision2Seq,
 )
-from docling.models.hf_vlm_models.hf_vlm_model_LlavaForConditionalGeneration import (
-    HuggingFaceVlmModel_LlavaForConditionalGeneration,
-)
+
+# from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -103,7 +100,7 @@ class VlmPipeline(PaginatedPipeline):
                 ]
             elif (
                 vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq
+                == InferenceFramework.TRANSFORMERS_VISION2SEQ
             ):
                 self.build_pipe = [
                     HuggingFaceVlmModel_AutoModelForVision2Seq(
@@ -115,7 +112,7 @@ class VlmPipeline(PaginatedPipeline):
                 ]
             elif (
                 vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_AutoModelForCausalLM
+                == InferenceFramework.TRANSFORMERS_CAUSALLM
             ):
                 self.build_pipe = [
                     HuggingFaceVlmModel_AutoModelForCausalLM(
@@ -125,18 +122,6 @@ class VlmPipeline(PaginatedPipeline):
                         vlm_options=vlm_options,
                     ),
                 ]
-            elif (
-                vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration
-            ):
-                self.build_pipe = [
-                    HuggingFaceVlmModel_LlavaForConditionalGeneration(
-                        enabled=True,  # must be always enabled for this pipeline to make sense.
-                        artifacts_path=artifacts_path,
-                        accelerator_options=pipeline_options.accelerator_options,
-                        vlm_options=vlm_options,
-                    ),
-                ]
             else:
                 raise ValueError(
                     f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
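With the Llava branch removed, the pipeline now dispatches only on MLX and the two renamed transformers members. A simplified sketch of that selection logic, not the verbatim VlmPipeline code; the InferenceFramework import path is assumed, and the model imports follow the new paths above:

    from docling.datamodel.pipeline_options import InferenceFramework  # import path assumed
    from docling.models.vlm_models_inline.hf_transformers_causallm_model import (
        HuggingFaceVlmModel_AutoModelForCausalLM,
    )
    from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
        HuggingFaceVlmModel_AutoModelForVision2Seq,
    )
    from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel


    def pick_model_cls(framework: InferenceFramework):
        # mirrors the if/elif chain in VlmPipeline.__init__ after this commit
        if framework == InferenceFramework.MLX:
            return HuggingFaceMlxModel
        if framework == InferenceFramework.TRANSFORMERS_VISION2SEQ:
            return HuggingFaceVlmModel_AutoModelForVision2Seq
        if framework == InferenceFramework.TRANSFORMERS_CAUSALLM:
            return HuggingFaceVlmModel_AutoModelForCausalLM
        raise ValueError(
            f"Could not instantiate the right type of VLM pipeline: {framework}"
        )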