refactoring the download_model

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2025-05-14 05:31:54 +02:00
parent 3407955a47
commit 4c0bc61e54
9 changed files with 64 additions and 221 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -269,7 +269,9 @@ class InferenceFramework(str, Enum):
    OPENAI = "openai"
    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
-    TRANSFORMERS_LlavaForConditionalGeneration = "transformers-LlavaForConditionalGeneration"
+    TRANSFORMERS_LlavaForConditionalGeneration = (
+        "transformers-LlavaForConditionalGeneration"
+    )


 class HuggingFaceVlmOptions(BaseVlmOptions):
--- a/docling/models/hf_vlm_model.py
+++ b/docling/models/hf_vlm_model.py
@@ -17,81 +17,7 @@ from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)


-class HuggingFaceVlmModel(BasePageModel):
-    """
-    def __init__(
-        self,
-        enabled: bool,
-        artifacts_path: Optional[Path],
-        accelerator_options: AcceleratorOptions,
-        vlm_options: HuggingFaceVlmOptions,
-    ):
-        self.enabled = enabled
-
-        self.vlm_options = vlm_options
-
-        if self.enabled:
-            import torch
-            from transformers import (  # type: ignore
-                AutoModelForVision2Seq,
-                AutoProcessor,
-                BitsAndBytesConfig,
-            )
-
-            device = decide_device(accelerator_options.device)
-            self.device = device
-
-            _log.debug(f"Available device for HuggingFace VLM: {device}")
-
-            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
-
-            # PARAMETERS:
-            if artifacts_path is None:
-                artifacts_path = self.download_models(self.vlm_options.repo_id)
-            elif (artifacts_path / repo_cache_folder).exists():
-                artifacts_path = artifacts_path / repo_cache_folder
-
-            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
-            self.param_quantization_config = BitsAndBytesConfig(
-                load_in_8bit=vlm_options.load_in_8bit,  # True,
-                llm_int8_threshold=vlm_options.llm_int8_threshold,  # 6.0
-            )
-            self.param_quantized = vlm_options.quantized  # False
-
-            self.processor = AutoProcessor.from_pretrained(
-                artifacts_path,
-                # trust_remote_code=True,
-            )
-            if not self.param_quantized:
-                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
-                    artifacts_path,
-                    device_map=self.device,
-                    torch_dtype=torch.bfloat16,
-                    _attn_implementation=(
-                        "flash_attention_2"
-                        if self.device.startswith("cuda")
-                        and accelerator_options.cuda_use_flash_attention2
-                        else "eager"
-                    ),
-                    # trust_remote_code=True,
-                )  # .to(self.device)
-
-            else:
-                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
-                    artifacts_path,
-                    device_map=self.device,
-                    torch_dtype="auto",
-                    quantization_config=self.param_quantization_config,
-                    _attn_implementation=(
-                        "flash_attention_2"
-                        if self.device.startswith("cuda")
-                        and accelerator_options.cuda_use_flash_attention2
-                        else "eager"
-                    ),
-                    # trust_remote_code=True,
-                )  # .to(self.device)
-    """
-
+class HuggingFaceVlmModel:
    @staticmethod
    def download_models(
        repo_id: str,
@@ -112,80 +38,3 @@ class HuggingFaceVlmModel(BasePageModel):
        )

        return Path(download_path)
-
-    """
-    def __call__(
-        self, conv_res: ConversionResult, page_batch: Iterable[Page]
-    ) -> Iterable[Page]:
-        for page in page_batch:
-            assert page._backend is not None
-            if not page._backend.is_valid():
-                yield page
-            else:
-                with TimeRecorder(conv_res, "vlm"):
-                    assert page.size is not None
-
-                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
-                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
-
-                    if hi_res_image is not None:
-                        im_width, im_height = hi_res_image.size
-
-                    # populate page_tags with predicted doc tags
-                    page_tags = ""
-
-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
-
-                    messages = [
-                        {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "text",
-                                    "text": "This is a page from a document.",
-                                },
-                                {"type": "image"},
-                                {"type": "text", "text": self.param_question},
-                            ],
-                        }
-                    ]
-                    prompt = self.processor.apply_chat_template(
-                        messages, add_generation_prompt=False
-                    )
-                    inputs = self.processor(
-                        text=prompt, images=[hi_res_image], return_tensors="pt"
-                    )
-                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-                    start_time = time.time()
-                    # Call model to generate:
-                    generated_ids = self.vlm_model.generate(
-                        **inputs, max_new_tokens=4096, use_cache=True
-                    )
-
-                    generation_time = time.time() - start_time
-                    generated_texts = self.processor.batch_decode(
-                        generated_ids[:, inputs["input_ids"].shape[1] :],
-                        skip_special_tokens=False,
-                    )[0]
-
-                    num_tokens = len(generated_ids[0])
-                    page_tags = generated_texts
-
-                    _log.debug(
-                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
-                    )
-
-                    # inference_time = time.time() - start_time
-                    # tokens_per_second = num_tokens / generation_time
-                    # print("")
-                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
-                    # print(f"Total tokens on page: {num_tokens:.2f}")
-                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
-                    # print("")
-                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
-
-                yield page
-    """
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -11,6 +11,7 @@ from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
 )
 from docling.models.base_model import BasePageModel
+from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -44,7 +45,10 @@ class HuggingFaceMlxModel(BasePageModel):

            # PARAMETERS:
            if artifacts_path is None:
-                artifacts_path = self.download_models(self.vlm_options.repo_id)
+                # artifacts_path = self.download_models(self.vlm_options.repo_id)
+                artifacts_path = HuggingFaceVlmModel.download_models(
+                    self.vlm_options.repo_id
+                )
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder

@@ -54,6 +58,7 @@ class HuggingFaceMlxModel(BasePageModel):
            self.vlm_model, self.processor = load(artifacts_path)
            self.config = load_config(artifacts_path)

+    """
    @staticmethod
    def download_models(
        repo_id: str,
@@ -74,6 +79,7 @@ class HuggingFaceMlxModel(BasePageModel):
        )

        return Path(download_path)
+    """

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -11,6 +11,7 @@ from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
 )
 from docling.models.base_model import BasePageModel
+from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder

@@ -30,7 +31,6 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
        self.trust_remote_code = True

        self.vlm_options = vlm_options
-        print(self.vlm_options)

        if self.enabled:
            import torch
@@ -49,7 +49,10 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):

            # PARAMETERS:
            if artifacts_path is None:
-                artifacts_path = self.download_models(self.vlm_options.repo_id)
+                # artifacts_path = self.download_models(self.vlm_options.repo_id)
+                artifacts_path = HuggingFaceVlmModel.download_models(
+                    self.vlm_options.repo_id
+                )
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder

@@ -99,6 +102,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
            # Load generation config
            self.generation_config = GenerationConfig.from_pretrained(model_path)

+    """
    @staticmethod
    def download_models(
        repo_id: str,
@@ -119,6 +123,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
        )

        return Path(download_path)
+    """

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -11,6 +11,7 @@ from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
 )
 from docling.models.base_model import BasePageModel
+from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder

@@ -46,7 +47,10 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):

            # PARAMETERS:
            if artifacts_path is None:
-                artifacts_path = self.download_models(self.vlm_options.repo_id)
+                # artifacts_path = self.download_models(self.vlm_options.repo_id)
+                artifacts_path = HuggingFaceVlmModel.download_models(
+                    self.vlm_options.repo_id
+                )
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder

@@ -90,6 +94,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                    # trust_remote_code=True,
                )  # .to(self.device)

+    """
    @staticmethod
    def download_models(
        repo_id: str,
@@ -110,6 +115,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
        )

        return Path(download_path)
+    """

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
--- a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
@@ -4,6 +4,8 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional

+from transformers import AutoProcessor, LlavaForConditionalGeneration
+
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
@@ -11,11 +13,10 @@ from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
 )
 from docling.models.base_model import BasePageModel
+from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder

-from transformers import AutoProcessor, LlavaForConditionalGeneration
-
 _log = logging.getLogger(__name__)


@@ -32,13 +33,12 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
        self.trust_remote_code = True

        self.vlm_options = vlm_options
-        print(self.vlm_options)

        if self.enabled:
            import torch
            from transformers import (  # type: ignore
-                LlavaForConditionalGeneration,
                AutoProcessor,
+                LlavaForConditionalGeneration,
            )

            self.device = decide_device(accelerator_options.device)
@@ -51,22 +51,27 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):

            # PARAMETERS:
            if artifacts_path is None:
-                artifacts_path = self.download_models(self.vlm_options.repo_id)
+                # artifacts_path = self.download_models(self.vlm_options.repo_id)
+                artifacts_path = HuggingFaceVlmModel.download_models(
+                    self.vlm_options.repo_id
+                )
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder

            model_path = artifacts_path
            print(f"model: {model_path}")

-            self.max_new_tokens = 64 # FIXME
+            self.max_new_tokens = 64  # FIXME

            self.processor = AutoProcessor.from_pretrained(
                artifacts_path,
                trust_remote_code=self.trust_remote_code,
            )
-            self.vlm_model = LlavaForConditionalGeneration.from_pretrained(artifacts_path).to(self.device)
-
+            self.vlm_model = LlavaForConditionalGeneration.from_pretrained(
+                artifacts_path
+            ).to(self.device)

+    """
    @staticmethod
    def download_models(
        repo_id: str,
@@ -87,6 +92,7 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
        )

        return Path(download_path)
+    """

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
@@ -109,20 +115,22 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
                        if hi_res_image.mode != "RGB":
                            hi_res_image = hi_res_image.convert("RGB")

-                    images = [
-                        hi_res_image
-                    ]
+                    images = [hi_res_image]
                    prompt = "<s>[INST]Describe the images.\n[IMG][/INST]"

-                    inputs = self.processor(text=prompt, images=images, return_tensors="pt", use_fast=False).to(self.device) #.to("cuda")
+                    inputs = self.processor(
+                        text=prompt, images=images, return_tensors="pt", use_fast=False
+                    ).to(self.device)  # .to("cuda")
                    generate_ids = self.vlm_model.generate(
                        **inputs,
                        max_new_tokens=self.max_new_tokens,
-                        use_cache=True  # Enables KV caching which can improve performance
+                        use_cache=True,  # Enables KV caching which can improve performance
                    )
-                    response = self.processor.batch_decode(generate_ids,
-                                                           skip_special_tokens=True,
-                                                           clean_up_tokenization_spaces=False)[0]
+                    response = self.processor.batch_decode(
+                        generate_ids,
+                        skip_special_tokens=True,
+                        clean_up_tokenization_spaces=False,
+                    )[0]
                    print(f"response: {response}")
                    """
                    _log.debug(
--- a/docling/models/hf_vlm_models/pixtral_12b_2409.py
+++ b/docling/models/hf_vlm_models/pixtral_12b_2409.py
@@ -1,33 +0,0 @@
-import logging
-import time
-from collections.abc import Iterable
-from pathlib import Path
-from typing import Optional
-
-from docling.datamodel.base_models import Page, VlmPrediction
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
-    HuggingFaceVlmOptions,
-)
-from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
-from docling.utils.profiling import TimeRecorder
-
-_log = logging.getLogger(__name__)
-
-
-class HuggingFaceVlmModel_pixtral_12b_2409(BasePageModel):
-    def __init__(
-        self,
-        enabled: bool,
-        artifacts_path: Optional[Path],
-        accelerator_options: AcceleratorOptions,
-        vlm_options: HuggingFaceVlmOptions,
-    ):
-        self.enabled = enabled
-
-        self.vlm_options = vlm_options
-
-        if self.enabled:
-            import torch
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -24,18 +24,16 @@ from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel

 # from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.models.hf_vlm_models.hf_vlm_mlx_model import (
-    HuggingFaceMlxModel
-)
-from docling.models.hf_vlm_models.hf_vlm_model_LlavaForConditionalGeneration import (
-    HuggingFaceVlmModel_LlavaForConditionalGeneration
-)
+from docling.models.hf_vlm_models.hf_vlm_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForCausalLM import (
    HuggingFaceVlmModel_AutoModelForCausalLM,
 )
 from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForVision2Seq import (
    HuggingFaceVlmModel_AutoModelForVision2Seq,
 )
+from docling.models.hf_vlm_models.hf_vlm_model_LlavaForConditionalGeneration import (
+    HuggingFaceVlmModel_LlavaForConditionalGeneration,
+)
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder

@@ -126,7 +124,9 @@ class VlmPipeline(PaginatedPipeline):
                    ),
                ]
            else:
-                raise ValueError(f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}")
+                raise ValueError(
+                    f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
+                )

        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -50,10 +50,10 @@ vlm_conversion_options = pixtral_vlm_conversion_options
 """

 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-     repo_id="mistral-community/pixtral-12b",
-     prompt="OCR this image and export it in MarkDown.",
-     response_format=ResponseFormat.MARKDOWN,
-     inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
+    repo_id="mistral-community/pixtral-12b",
+    prompt="OCR this image and export it in MarkDown.",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
 )
 vlm_conversion_options = pixtral_vlm_conversion_options