use lowercase and uppercase only (uppercase InferenceFramework member names, lowercase string values)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2025-06-01 17:55:16 +02:00 · 2025-06-01 17:55:16 +02:00 · f63312add6
commit f63312add6
parent 8686842478
8 changed files with 13 additions and 183 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -319,11 +319,8 @@ class ResponseFormat(str, Enum):
 class InferenceFramework(str, Enum):
    MLX = "mlx"
    TRANSFORMERS = "transformers"
-    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
+    TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
-    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
+    TRANSFORMERS_CAUSALLM = "transformers-causallm"
    TRANSFORMERS_LlavaForConditionalGeneration = (
        "transformers-LlavaForConditionalGeneration"
    )
 class HuggingFaceVlmOptions(BaseVlmOptions):
--- a/docling/datamodel/pipeline_vlm_model_spec.py
+++ b/docling/datamodel/pipeline_vlm_model_spec.py
@ -29,7 +29,7 @@ SMOLDOCLING_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
    scale=2.0,
    temperature=0.0,
 )
@ -39,7 +39,7 @@ GRANITE_VISION_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
    scale=2.0,
    temperature=0.0,
 )
@ -59,7 +59,7 @@ PIXTRAL_12B_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="mistral-community/pixtral-12b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
    scale=2.0,
    temperature=0.0,
 )
@ -78,7 +78,7 @@ PHI4_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
+    inference_framework=InferenceFramework.TRANSFORMERS_CAUSALLM,
    scale=2.0,
    temperature=0.0,
 )
--- a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
@ -1,152 +0,0 @@
 import logging
 import time
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    HuggingFaceVlmOptions,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        vlm_options: HuggingFaceVlmOptions,
    ):
        self.enabled = enabled
        self.trust_remote_code = True
        self.vlm_options = vlm_options
        if self.enabled:
            from transformers import (  # type: ignore
                AutoProcessor,
                LlavaForConditionalGeneration,
            )
            self.device = decide_device(accelerator_options.device)
            self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
            self.use_cache = vlm_options.use_kv_cache
            self.max_new_tokens = vlm_options.max_new_tokens
            self.temperature = vlm_options.temperature
            _log.debug(f"Available device for VLM: {self.device}")
            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
            if artifacts_path is None:
                artifacts_path = HuggingFaceVlmModel.download_models(
                    self.vlm_options.repo_id
                )
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder
            self.processor = AutoProcessor.from_pretrained(
                artifacts_path,
                trust_remote_code=self.trust_remote_code,
            )
            self.vlm_model = LlavaForConditionalGeneration.from_pretrained(
                artifacts_path,
                device_map=self.device,
                # torch_dtype="auto",
                # quantization_config=self.param_quantization_config,
                _attn_implementation=(
                    "flash_attention_2"
                    if self.device.startswith("cuda")
                    and accelerator_options.cuda_use_flash_attention2
                    else "eager"
                ),
            ).to(self.device)
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None
                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size
                    """
                    if hi_res_image:
                        if hi_res_image.mode != "RGB":
                            hi_res_image = hi_res_image.convert("RGB")
                    """
                    images = [hi_res_image]
                    # Define prompt structure
                    prompt = self.formulate_prompt()
                    inputs = self.processor(
                        text=prompt, images=images, return_tensors="pt"
                    ).to(self.device)
                    # Generate response
                    start_time = time.time()
                    generate_ids = self.vlm_model.generate(
                        **inputs,
                        max_new_tokens=self.max_new_tokens,
                        use_cache=self.use_cache,  # Enables KV caching which can improve performance
                        temperature=self.temperature,
                    )
                    # num_tokens = len(generate_ids[0])
                    generation_time = time.time() - start_time
                    response = self.processor.batch_decode(
                        generate_ids,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False,
                    )[0]
                    page.predictions.vlm_response = VlmPrediction(
                        text=response,
                        # generated_tokens=num_tokens,
                        generation_time=generation_time,
                    )
                yield page
    def formulate_prompt(self) -> str:
        """Formulate a prompt for the VLM."""
        if self.vlm_options.repo_id == "mistral-community/pixtral-12b":
            chat = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "content": self.vlm_options.prompt},
                        {"type": "image"},
                    ],
                }
            ]
            prompt = self.processor.apply_chat_template(chat)
            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
            return prompt
        else:
            raise ValueError(f"No prompt template for {self.vlm_options.repo_id}")
        return ""
--- a/docling/models/vlm_models_inline/init.py
+++ b/docling/models/vlm_models_inline/init.py
--- a/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
--- a/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@ -36,18 +36,15 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel
-
+from docling.models.vlm_models_inline.hf_transformers_causallm_model import (
 # from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.models.hf_vlm_models.hf_vlm_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForCausalLM import (
    HuggingFaceVlmModel_AutoModelForCausalLM,
 )
-from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForVision2Seq import (
+from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
    HuggingFaceVlmModel_AutoModelForVision2Seq,
 )
-from docling.models.hf_vlm_models.hf_vlm_model_LlavaForConditionalGeneration import (
+
-    HuggingFaceVlmModel_LlavaForConditionalGeneration,
+# from docling.models.hf_vlm_model import HuggingFaceVlmModel
-)
+from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -103,7 +100,7 @@ class VlmPipeline(PaginatedPipeline):
                ]
            elif (
                vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq
+                == InferenceFramework.TRANSFORMERS_VISION2SEQ
            ):
                self.build_pipe = [
                    HuggingFaceVlmModel_AutoModelForVision2Seq(
@ -115,7 +112,7 @@ class VlmPipeline(PaginatedPipeline):
                ]
            elif (
                vlm_options.inference_framework
-                == InferenceFramework.TRANSFORMERS_AutoModelForCausalLM
+                == InferenceFramework.TRANSFORMERS_CAUSALLM
            ):
                self.build_pipe = [
                    HuggingFaceVlmModel_AutoModelForCausalLM(
@ -125,18 +122,6 @@ class VlmPipeline(PaginatedPipeline):
                        vlm_options=vlm_options,
                    ),
                ]
            elif (
                vlm_options.inference_framework
                == InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration
            ):
                self.build_pipe = [
                    HuggingFaceVlmModel_LlavaForConditionalGeneration(
                        enabled=True,  # must be always enabled for this pipeline to make sense.
                        artifacts_path=artifacts_path,
                        accelerator_options=pipeline_options.accelerator_options,
                        vlm_options=vlm_options,
                    ),
                ]
            else:
                raise ValueError(
                    f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"