diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index c267a639..33940c45 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -16,6 +16,7 @@ from docling.datamodel import asr_model_specs
 
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import WHISPER_TINY as whisper_tiny
 from docling.datamodel.pipeline_options_asr_model import (
     InlineAsrOptions,
 )
diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index a2b31d9d..a38f0414 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -6,6 +6,7 @@ from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 
 from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options import LayoutOptions
 
 
 class BaseVlmOptions(BaseModel):
@@ -90,7 +91,7 @@ class ApiVlmOptions(BaseVlmOptions):
 
 
 class TwoStageVlmOptions(BaseVlmOptions):
-    kind: Literal["inline_model_options"] = "inline_two_stage_model_options"
+    kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"
 
-    vlm_options: UnionInlineVlmOptions
+    vlm_options: InlineVlmOptions
     layout_options: LayoutOptions
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
index b0a43f40..dd019216 100644
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@@ -3,6 +3,7 @@ from collections.abc import Iterable
 from typing import Generic, Optional, Protocol, Type
 
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from PIL import Image
 from typing_extensions import TypeVar
 
-from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
+from docling.datamodel.base_models import Cluster, ItemAndImageEnrichmentElement, Page
@@ -19,12 +20,25 @@ class BaseModelWithOptions(Protocol):
 
 
 class BasePageModel(ABC):
+    scale: float  # scale at which the page image is created (dpi = 72 * scale)
+    max_size: int  # maximum width/height of the page image
+
     @abstractmethod
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         pass
 
 
+class BaseLayoutModel(BasePageModel):
+    @abstractmethod
+    def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
+        pass
+
+class BaseVlmModel(BasePageModel):
+    @abstractmethod
+    def predict_on_page_image(self, *, page_image: Image.Image, prompt: str) -> str:
+        pass
+
 EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index 9b90f4bd..58c9646e 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -7,6 +7,7 @@ from typing import Optional
 
 import numpy as np
 from docling_core.types.doc import DocItemLabel
+from docling_core.types.doc.page import TextCell
 from PIL import Image
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
@@ -176,11 +177,11 @@ class LayoutModel(BasePageModel):
                             )
                             clusters.append(cluster)
                     """
-                    clusters = self.predict_on_page(page_image)
-
+                    predicted_clusters = self.predict_on_page(page_image=page_image)
+
                     if settings.debug.visualize_raw_layout:
                         self.draw_clusters_and_cells_side_by_side(
-                            conv_res, page, clusters, mode_prefix="raw"
+                            conv_res, page, predicted_clusters, mode_prefix="raw"
                         )
 
                     # Apply postprocessing
@@ -212,8 +213,28 @@ class LayoutModel(BasePageModel):
                            clusters=processed_clusters
                        )
                    """
-                    page, processed_clusters, processed_cells = self.postprocess_on_page(page, cluster)
-
+                    page, processed_clusters, processed_cells = (
+                        self.postprocess_on_page(page=page, clusters=predicted_clusters)
+                    )
+
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings(
+                            "ignore",
+                            "Mean of empty slice|invalid value encountered in scalar divide",
+                            RuntimeWarning,
+                            "numpy",
+                        )
+
+                        conv_res.confidence.pages[page.page_no].layout_score = float(
+                            np.mean([c.confidence for c in processed_clusters])
+                        )
+
+                        conv_res.confidence.pages[page.page_no].ocr_score = float(
+                            np.mean(
+                                [c.confidence for c in processed_cells if c.from_ocr]
+                            )
+                        )
+
                     if settings.debug.visualize_layout:
                         self.draw_clusters_and_cells_side_by_side(
                             conv_res, page, processed_clusters, mode_prefix="postprocessed"
@@ -221,17 +242,13 @@ class LayoutModel(BasePageModel):
 
                     yield page
 
-    def predict_on_page(self, page_image: Image) -> list[Cluster]:
-
+    def predict_on_page(self, *, page_image: Image.Image) -> list[Cluster]:
         pred_items = self.layout_predictor.predict(page_image)
 
         clusters = []
         for ix, pred_item in enumerate(pred_items):
             label = DocItemLabel(
-                pred_item["label"]
-                .lower()
-                .replace(" ", "_")
-                .replace("-", "_")
+                pred_item["label"].lower().replace(" ", "_").replace("-", "_")
             )  # Temporary, until docling-ibm-model uses docling-core types
             cluster = Cluster(
                 id=ix,
@@ -241,36 +258,17 @@ class LayoutModel(BasePageModel):
                 cells=[],
             )
             clusters.append(cluster)
-
+
         return clusters
 
-    def postprocess_on_page(self, page: Page, cluster: list(Cluster)):
-
+    def postprocess_on_page(
+        self, *, page: Page, clusters: list[Cluster]
+    ) -> tuple[Page, list[Cluster], list[TextCell]]:
         processed_clusters, processed_cells = LayoutPostprocessor(
             page, clusters, self.options
         ).postprocess()
         # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
-
-        with warnings.catch_warnings():
-            warnings.filterwarnings(
-                "ignore",
-                "Mean of empty slice|invalid value encountered in scalar divide",
-                RuntimeWarning,
-                "numpy",
-            )
-            conv_res.confidence.pages[page.page_no].layout_score = float(
-                np.mean([c.confidence for c in processed_clusters])
-            )
-
-            conv_res.confidence.pages[page.page_no].ocr_score = float(
-                np.mean(
-                    [c.confidence for c in processed_cells if c.from_ocr]
-                )
-            )
-
-        page.predictions.layout = LayoutPrediction(
-            clusters=processed_clusters
-        )
+        page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
 
         return page, processed_clusters, processed_cells
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index d84925dd..3513de3e 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -15,7 +15,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     TransformersModelType,
     TransformersPromptStyle,
 )
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BasePageModel, BaseVlmModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
 )
@@ -25,7 +25,7 @@ from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
+class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -37,6 +37,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
 
         self.vlm_options = vlm_options
 
+        self.scale = self.vlm_options.scale
+        self.max_size = self.vlm_options.max_size
+
         if self.enabled:
             import torch
             from transformers import (
diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
index 647ce531..ddeea379 100644
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -10,7 +10,7 @@ from docling.datamodel.accelerator_options import (
 from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BasePageModel, BaseVlmModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
 )
@@ -19,7 +19,7 @@ from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
+class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -28,10 +28,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         vlm_options: InlineVlmOptions,
     ):
         self.enabled = enabled
 
         self.vlm_options = vlm_options
         self.max_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
+        self.scale = self.vlm_options.scale
+        self.max_size = self.vlm_options.max_size
 
         if self.enabled:
             try:
diff --git a/docling/models/vlm_models_inline/TwoStageVlmModel.py b/docling/models/vlm_models_inline/two_stage_vlm_model.py
similarity index 82%
rename from docling/models/vlm_models_inline/TwoStageVlmModel.py
rename to docling/models/vlm_models_inline/two_stage_vlm_model.py
index f540ea4b..b31d46a9 100644
--- a/docling/models/vlm_models_inline/TwoStageVlmModel.py
+++ b/docling/models/vlm_models_inline/two_stage_vlm_model.py
@@ -15,7 +15,8 @@ from docling.datamodel.pipeline_options_vlm_model import (
     TransformersModelType,
     TransformersPromptStyle,
 )
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BasePageModel, BaseVlmModel
+from docling.models.layout_model import LayoutModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
 )
@@ -29,11 +30,11 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         *,
-        layout_model: LayoutModelModel,
-        vlm_model: BasePageModel,
+        layout_model: LayoutModel,
+        vlm_model: BaseVlmModel,
     ):
-        self.layout_model = layout_options
-        self.vlm_model = vlm_options
+        self.layout_model = layout_model
+        self.vlm_model = vlm_model
 
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
@@ -47,23 +48,27 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 assert page.size is not None
 
                 page_image = page.get_image(
-                    scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    scale=self.vlm_model.scale, max_size=self.vlm_model.max_size
+                )
+
+                pred_clusters = self.layout_model.predict_on_page(page_image=page_image)
+                page, processed_clusters, processed_cells = (
+                    self.layout_model.postprocess_on_page(
+                        page=page, clusters=pred_clusters
+                    )
                 )
 
-                pred_clusters = self.layout_model.predict_on_page(page_image)
-                page, processed_clusters, processed_cells = self.layout_model.postprocess_on_page(page=page,
-                                                                                                  page_image=page_image)
-
                 # Define prompt structure
                 if callable(self.vlm_options.prompt):
                    user_prompt = self.vlm_options.prompt(page.parsed_page)
                 else:
                     user_prompt = self.vlm_options.prompt
-
+
                 prompt = self.formulate_prompt(user_prompt, processed_clusters)
 
-                generated_text, generation_time = self.vlm_model.predict_on_image(page_image=page_image,
-                                                                                  prompt=prompt)
+                generated_text, generation_time = self.vlm_model.predict_on_page_image(
+                    page_image=page_image, prompt=prompt
+                )
 
                 page.predictions.vlm_response = VlmPrediction(
                     text=generated_text,
@@ -72,7 +77,7 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
 
                 yield page
 
-    def formulate_prompt(self, user_prompt: str, clusters:list[Cluster]) -> str:
+    def formulate_prompt(self, user_prompt: str, clusters: list[Cluster]) -> str:
         """Formulate a prompt for the VLM."""
 
         if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 4c38e02a..01be3693 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -26,9 +26,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import (
-    VlmPipelineOptions,
-)
+from docling.datamodel.pipeline_options import TwoStageVlmOptions, VlmPipelineOptions
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -37,15 +35,17 @@ from docling.datamodel.pipeline_options_vlm_model import (
 )
 from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel
+from docling.models.layout_model import LayoutModel
 from docling.models.vlm_models_inline.hf_transformers_model import (
     HuggingFaceTransformersVlmModel,
 )
 from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
+from docling.models.vlm_models_inline.two_stage_vlm_model import (
+    TwoStageVlmModel,
+)
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
-from docling.models.layout_model import LayoutModel
-
 _log = logging.getLogger(__name__)
 
 
@@ -110,7 +110,9 @@ class VlmPipeline(PaginatedPipeline):
                     f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
                 )
         elif isinstance(self.pipeline_options.vlm_options, TwoStageVlmOptions):
-            twostagevlm_options = cast(TwoStageVlmOptions, self.pipeline_options.vlm_options)
+            twostagevlm_options = cast(
+                TwoStageVlmOptions, self.pipeline_options.vlm_options
+            )
 
             layout_options = twostagevlm_options.lay_options
             vlm_options = twostagevlm_options.vlm_options
@@ -120,7 +122,7 @@ class VlmPipeline(PaginatedPipeline):
                 accelerator_options=pipeline_options.accelerator_options,
                 options=layout_options,
             )
-            
+
             if vlm_options.inference_framework == InferenceFramework.MLX:
                 vlm_model = HuggingFaceMlxModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
@@ -145,7 +147,7 @@ class VlmPipeline(PaginatedPipeline):
                 raise ValueError(
                     f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
                 )
-            
+
             self.enrichment_pipe = [
                 # Other models working on `NodeItem` elements in the DoclingDocument
             ]
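
Usage sketch (not part of the diff): a minimal example of how the two-stage options introduced above could be wired into a VLM pipeline run. It assumes that `VlmPipelineOptions.vlm_options` accepts the new `TwoStageVlmOptions` kind, that `vlm_model_specs.SMOLDOCLING_MLX` is available as an inline VLM spec, and that the input file name is hypothetical.

```python
# Illustrative sketch only; field names follow the classes changed in this diff,
# while the concrete VLM spec and the input path are assumptions.
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import TwoStageVlmOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Stage 1: the layout model proposes clusters; stage 2: an inline VLM transcribes the
# page, prompted with the predicted layout (see TwoStageVlmModel.formulate_prompt).
two_stage_options = TwoStageVlmOptions(
    prompt="",  # the two-stage model derives its effective prompt from the layout clusters
    layout_options=LayoutOptions(),
    vlm_options=vlm_model_specs.SMOLDOCLING_MLX,  # assumed spec; any InlineVlmOptions should fit
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=VlmPipelineOptions(vlm_options=two_stage_options),
        )
    }
)

result = converter.convert("report.pdf")  # hypothetical input file
print(result.document.export_to_markdown())
```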