commit 0418d2887f
Peter W. J. Staar, 2025-07-23 14:06:06 +02:00, committed by GitHub
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
15 changed files with 506 additions and 79 deletions

View File

@ -63,6 +63,7 @@ from docling.datamodel.vlm_model_specs import (
GRANITE_VISION_TRANSFORMERS,
SMOLDOCLING_MLX,
SMOLDOCLING_TRANSFORMERS,
VLM2STAGE,
VlmModelType,
)
from docling.document_converter import (
@ -627,6 +628,12 @@ def convert( # noqa: C901
"To run SmolDocling faster, please install mlx-vlm:\n"
"pip install mlx-vlm"
)
elif vlm_model == VlmModelType.VLM2STAGE:
pipeline_options.vlm_options = VLM2STAGE
else:
raise ValueError(
f"{vlm_model} is not of type GRANITE_VISION, GRANITE_VISION_OLLAMA, SMOLDOCLING_TRANSFORMERS or VLM2STAGE"
)
pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options

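For orientation, a minimal sketch of what the new `vlm2stage` CLI branch wires up when driven from Python instead; everything except VLM2STAGE is the pre-existing converter API, and the input file name is illustrative:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import VLM2STAGE
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Select the new two-stage preset (layout model + VLM) for the VLM pipeline.
pipeline_options = VlmPipelineOptions(vlm_options=VLM2STAGE)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("example.pdf")  # hypothetical input document
print(result.document.export_to_markdown())
```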
View File

@ -11,12 +11,13 @@ from docling.datamodel.pipeline_options_asr_model import (
# ApiAsrOptions,
InferenceAsrFramework,
InlineAsrNativeWhisperOptions,
TransformersModelType,
InlineAsrOptions,
# TransformersModelType,
)
_log = logging.getLogger(__name__)
WHISPER_TINY = InlineAsrNativeWhisperOptions(
WHISPER_TINY: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="tiny",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -27,7 +28,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_SMALL = InlineAsrNativeWhisperOptions(
WHISPER_SMALL: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="small",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -38,7 +39,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
WHISPER_MEDIUM: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="medium",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -49,7 +50,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_BASE = InlineAsrNativeWhisperOptions(
WHISPER_BASE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="base",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -60,7 +61,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_LARGE = InlineAsrNativeWhisperOptions(
WHISPER_LARGE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="large",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -71,7 +72,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_TURBO = InlineAsrNativeWhisperOptions(
WHISPER_TURBO: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="turbo",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,

View File

@ -26,8 +26,6 @@ class LayoutModelConfig(BaseModel):
return self.repo_id.replace("/", "--")
# HuggingFace Layout Models
# Default Docling Layout Model
DOCLING_LAYOUT_V2 = LayoutModelConfig(
name="docling_layout_v2",

View File

@ -12,10 +12,16 @@ from pydantic import (
)
from typing_extensions import deprecated
from docling.datamodel import asr_model_specs
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
WHISPER_BASE,
WHISPER_LARGE,
WHISPER_MEDIUM,
WHISPER_SMALL,
WHISPER_TINY,
WHISPER_TURBO,
)
from docling.datamodel.layout_model_specs import (
DOCLING_LAYOUT_EGRET_LARGE,
DOCLING_LAYOUT_EGRET_MEDIUM,
@ -33,6 +39,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TwoStageVlmOptions,
)
from docling.datamodel.vlm_model_specs import (
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
@ -270,8 +277,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = (
smoldocling_vlm_conversion_options
# SMOLDOCLING_TRANSFORMERS
)
@ -286,7 +294,7 @@ class LayoutOptions(BaseModel):
class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
asr_options: Union[InlineAsrOptions] = WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None

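A short sketch of how the widened option unions in this file are meant to be used; the model presets come from the spec modules this file already imports, the variable names are illustrative:

```python
from docling.datamodel.asr_model_specs import WHISPER_TURBO
from docling.datamodel.pipeline_options import AsrPipelineOptions, VlmPipelineOptions
from docling.datamodel.vlm_model_specs import VLM2STAGE

# vlm_options now accepts an inline model, an API model, or the new two-stage combo.
vlm_opts = VlmPipelineOptions(vlm_options=VLM2STAGE)

# asr_options keeps WHISPER_TINY as its default; any of the presets can be swapped in.
asr_opts = AsrPipelineOptions(asr_options=WHISPER_TURBO)
```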
View File

@ -5,10 +5,11 @@ from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
# InferenceFramework,
TransformersModelType,
)
# from docling.datamodel.pipeline_options_vlm_model import (
# InferenceFramework,
# TransformersModelType,
# )
class BaseAsrOptions(BaseModel):

View File

@ -6,6 +6,9 @@ from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.layout_model_specs import (
LayoutModelConfig,
)
class BaseVlmOptions(BaseModel):
@ -87,3 +90,12 @@ class ApiVlmOptions(BaseVlmOptions):
timeout: float = 60
concurrency: int = 1
response_format: ResponseFormat
class TwoStageVlmOptions(BaseModel):
kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"
response_format: ResponseFormat # final response of the VLM
layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS

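A sketch of composing a custom TwoStageVlmOptions, mirroring the VLM2STAGE preset added later in this commit; the particular layout/VLM pairing is illustrative:

```python
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options_vlm_model import TwoStageVlmOptions
from docling.datamodel.vlm_model_specs import SMOLDOCLING_MLX

two_stage = TwoStageVlmOptions(
    layout_options=DOCLING_LAYOUT_HERON,               # stage 1: layout detection
    vlm_options=SMOLDOCLING_MLX,                        # stage 2: VLM transcription
    response_format=SMOLDOCLING_MLX.response_format,    # final output format follows the VLM
)
```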
View File

@ -6,12 +6,17 @@ from pydantic import (
)
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.layout_model_specs import (
DOCLING_LAYOUT_HERON,
DOCLING_LAYOUT_V2,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TransformersModelType,
TwoStageVlmOptions,
)
_log = logging.getLogger(__name__)
@ -137,8 +142,15 @@ GEMMA3_27B_MLX = InlineVlmOptions(
temperature=0.0,
)
VLM2STAGE = TwoStageVlmOptions(
vlm_options=SMOLDOCLING_MLX,
layout_options=DOCLING_LAYOUT_HERON,
response_format=SMOLDOCLING_MLX.response_format,
)
class VlmModelType(str, Enum):
SMOLDOCLING = "smoldocling"
GRANITE_VISION = "granite_vision"
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
VLM2STAGE = "vlm2stage"

View File

@ -3,9 +3,16 @@ from collections.abc import Iterable
from typing import Generic, Optional, Protocol, Type
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from PIL import Image
from typing_extensions import TypeVar
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
from docling.datamodel.base_models import (
Cluster,
ItemAndImageEnrichmentElement,
Page,
TextCell,
VlmPredictionToken,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import BaseOptions
from docling.datamodel.settings import settings
@ -19,6 +26,9 @@ class BaseModelWithOptions(Protocol):
class BasePageModel(ABC):
scale: float # scale with which the page-image needs to be created (dpi = 72*scale)
max_size: int # max size of width/height of page-image
@abstractmethod
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
@ -26,6 +36,30 @@ class BasePageModel(ABC):
pass
class BaseLayoutModel(BasePageModel):
@abstractmethod
def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
pass
@abstractmethod
def postprocess_on_page_image(
self, *, page: Page, clusters: list[Cluster]
) -> tuple[Page, list[Cluster], list[TextCell]]:
pass
class BaseVlmModel(BasePageModel):
@abstractmethod
def get_user_prompt(self, page: Optional[Page]) -> str:
pass
@abstractmethod
def predict_on_page_image(
self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
) -> tuple[str, Optional[list[VlmPredictionToken]]]:
pass
EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)

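To make the new contracts concrete, a minimal sketch of a custom backend implementing BaseVlmModel; EchoVlmModel and its return values are hypothetical and only show which members a subclass must provide:

```python
from collections.abc import Iterable
from typing import Optional

from PIL import Image

from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BaseVlmModel


class EchoVlmModel(BaseVlmModel):
    scale: float = 2.0   # render page images at 144 dpi
    max_size: int = 512  # cap on page-image width/height

    def get_user_prompt(self, page: Optional[Page]) -> str:
        return "Convert this page to DocTags."

    def predict_on_page_image(
        self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
    ) -> tuple[str, Optional[list[VlmPredictionToken]]]:
        # A real backend would run its model here.
        return "<doctags></doctags>", []

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            image = page.get_image(scale=self.scale, max_size=self.max_size)
            if image is not None:
                text, _ = self.predict_on_page_image(
                    page_image=image, prompt=self.get_user_prompt(page)
                )
                page.predictions.vlm_response = VlmPrediction(
                    text=text, generation_time=0.0
                )
            yield page
```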
View File

@ -7,6 +7,7 @@ from typing import Optional
import numpy as np
from docling_core.types.doc import DocItemLabel
from docling_core.types.doc.page import TextCell
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
@ -15,7 +16,7 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
from docling.datamodel.pipeline_options import LayoutOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.base_model import BaseLayoutModel, BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
@ -25,7 +26,7 @@ from docling.utils.visualization import draw_clusters
_log = logging.getLogger(__name__)
class LayoutModel(BasePageModel):
class LayoutModel(BaseLayoutModel):
TEXT_ELEM_LABELS = [
DocItemLabel.TEXT,
DocItemLabel.FOOTNOTE,
@ -158,6 +159,7 @@ class LayoutModel(BasePageModel):
page_image = page.get_image(scale=1.0)
assert page_image is not None
"""
clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page_image)
@ -176,14 +178,18 @@ class LayoutModel(BasePageModel):
cells=[],
)
clusters.append(cluster)
"""
predicted_clusters = self.predict_on_page_image(
page_image=page_image
)
if settings.debug.visualize_raw_layout:
self.draw_clusters_and_cells_side_by_side(
conv_res, page, clusters, mode_prefix="raw"
conv_res, page, predicted_clusters, mode_prefix="raw"
)
# Apply postprocessing
"""
processed_clusters, processed_cells = LayoutPostprocessor(
page, clusters, self.options
).postprocess()
@ -210,6 +216,30 @@ class LayoutModel(BasePageModel):
page.predictions.layout = LayoutPrediction(
clusters=processed_clusters
)
"""
page, processed_clusters, processed_cells = (
self.postprocess_on_page_image(
page=page, clusters=predicted_clusters
)
)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Mean of empty slice|invalid value encountered in scalar divide",
RuntimeWarning,
"numpy",
)
conv_res.confidence.pages[page.page_no].layout_score = float(
np.mean([c.confidence for c in processed_clusters])
)
conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean(
[c.confidence for c in processed_cells if c.from_ocr]
)
)
if settings.debug.visualize_layout:
self.draw_clusters_and_cells_side_by_side(
@ -217,3 +247,34 @@ class LayoutModel(BasePageModel):
)
yield page
def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
pred_items = self.layout_predictor.predict(page_image)
clusters = []
for ix, pred_item in enumerate(pred_items):
label = DocItemLabel(
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
cluster = Cluster(
id=ix,
label=label,
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
cells=[],
)
clusters.append(cluster)
return clusters
def postprocess_on_page_image(
self, *, page: Page, clusters: list[Cluster]
) -> tuple[Page, list[Cluster], list[TextCell]]:
processed_clusters, processed_cells = LayoutPostprocessor(
page, clusters, self.options
).postprocess()
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
return page, processed_clusters, processed_cells

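The refactor splits layout prediction into two reusable steps; a sketch of the call pattern, assuming `layout_model` is an initialized LayoutModel and `page` a populated Page:

```python
page_image = page.get_image(scale=1.0)
assert page_image is not None

# Step 1: raw clusters straight from the layout predictor.
clusters = layout_model.predict_on_page_image(page_image=page_image)

# Step 2: postprocess against the page cells; this also sets page.predictions.layout.
page, processed_clusters, processed_cells = layout_model.postprocess_on_page_image(
    page=page, clusters=clusters
)
```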
View File

@ -5,17 +5,19 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Any, Optional
from PIL import Image
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import (
InlineVlmOptions,
TransformersModelType,
TransformersPromptStyle,
)
from docling.models.base_model import BasePageModel
from docling.models.base_model import BasePageModel, BaseVlmModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
@ -25,7 +27,7 @@ from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
enabled: bool,
@ -37,6 +39,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
self.vlm_options = vlm_options
self.scale = self.vlm_options.scale
self.max_size = 512
if isinstance(self.vlm_options.max_size, int):
self.max_size = self.vlm_options.max_size
if self.enabled:
import torch
from transformers import (
@ -119,6 +126,43 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
# Load generation config
self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
def get_user_prompt(self, page: Optional[Page]) -> str:
# Define prompt structure
user_prompt = ""
if callable(self.vlm_options.prompt) and page is not None:
user_prompt = self.vlm_options.prompt(page.parsed_page)
elif isinstance(self.vlm_options.prompt, str):
user_prompt = self.vlm_options.prompt
prompt = self.formulate_prompt(user_prompt)
return prompt
def predict_on_page_image(
self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
) -> tuple[str, Optional[list[VlmPredictionToken]]]:
output = ""
inputs = self.processor(
text=prompt, images=[page_image], return_tensors="pt"
).to(self.device)
# Call model to generate:
generated_ids = self.vlm_model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
use_cache=self.use_cache,
temperature=self.temperature,
generation_config=self.generation_config,
**self.vlm_options.extra_generation_config,
)
output = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=False,
)[0]
return output, []
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@ -130,22 +174,29 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(
page_image = page.get_image(
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
)
assert page_image is not None
# Define prompt structure
"""
if callable(self.vlm_options.prompt):
user_prompt = self.vlm_options.prompt(page.parsed_page)
else:
user_prompt = self.vlm_options.prompt
prompt = self.formulate_prompt(user_prompt)
inputs = self.processor(
text=prompt, images=[hi_res_image], return_tensors="pt"
).to(self.device)
"""
prompt = self.get_user_prompt(page=page)
start_time = time.time()
"""
inputs = self.processor(
text=prompt, images=[page_image], return_tensors="pt"
).to(self.device)
# Call model to generate:
generated_ids = self.vlm_model.generate(
**inputs,
@ -166,9 +217,14 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
_log.debug(
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
)
"""
generated_text, _ = self.predict_on_page_image(  # unpack (text, tokens); tokens unused here
page_image=page_image, prompt=prompt, output_tokens=False
)
page.predictions.vlm_response = VlmPrediction(
text=generated_texts,
generation_time=generation_time,
text=generated_text,
generation_time=time.time() - start_time,
)
yield page

View File

@ -4,13 +4,15 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from PIL import Image
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.base_model import BasePageModel, BaseVlmModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
@ -19,7 +21,7 @@ from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
enabled: bool,
@ -28,10 +30,15 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
vlm_options: InlineVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
self.max_tokens = vlm_options.max_new_tokens
self.temperature = vlm_options.temperature
self.scale = self.vlm_options.scale
self.max_size = 512
if isinstance(self.vlm_options.max_size, int):
self.max_size = self.vlm_options.max_size
if self.enabled:
try:
@ -60,6 +67,55 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
self.vlm_model, self.processor = load(artifacts_path)
self.config = load_config(artifacts_path)
def get_user_prompt(self, page: Optional[Page]) -> str:
if callable(self.vlm_options.prompt) and page is not None:
return self.vlm_options.prompt(page.parsed_page)
else:
user_prompt = self.vlm_options.prompt
prompt = self.apply_chat_template(
self.processor, self.config, user_prompt, num_images=1
)
return prompt
def predict_on_page_image(
self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
) -> tuple[str, Optional[list[VlmPredictionToken]]]:
tokens = []
output = ""
for token in self.stream_generate(
self.vlm_model,
self.processor,
prompt,
[page_image],
max_tokens=self.max_tokens,
verbose=False,
temp=self.temperature,
):
if len(token.logprobs.shape) == 1:
tokens.append(
VlmPredictionToken(
text=token.text,
token=token.token,
logprob=token.logprobs[token.token],
)
)
elif len(token.logprobs.shape) == 2 and token.logprobs.shape[0] == 1:
tokens.append(
VlmPredictionToken(
text=token.text,
token=token.token,
logprob=token.logprobs[0, token.token],
)
)
else:
_log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")
output += token.text
if "</doctag>" in token.text:
break
return output, tokens
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@ -71,19 +127,23 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
assert page.size is not None
hi_res_image = page.get_image(
page_image = page.get_image(
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
)
if hi_res_image is not None:
im_width, im_height = hi_res_image.size
"""
if page_image is not None:
im_width, im_height = page_image.size
"""
assert page_image is not None
# populate page_tags with predicted doc tags
page_tags = ""
if hi_res_image:
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
if page_image:
if page_image.mode != "RGB":
page_image = page_image.convert("RGB")
"""
if callable(self.vlm_options.prompt):
user_prompt = self.vlm_options.prompt(page.parsed_page)
else:
@ -91,11 +151,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
prompt = self.apply_chat_template(
self.processor, self.config, user_prompt, num_images=1
)
start_time = time.time()
_log.debug("start generating ...")
"""
prompt = self.get_user_prompt(page)
# Call model to generate:
start_time = time.time()
"""
tokens: list[VlmPredictionToken] = []
output = ""
@ -103,7 +164,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
self.vlm_model,
self.processor,
prompt,
[hi_res_image],
[page_image],
max_tokens=self.max_tokens,
verbose=False,
temp=self.temperature,
@ -135,13 +196,20 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
output += token.text
if "</doctag>" in token.text:
break
"""
output, tokens = self.predict_on_page_image(
page_image=page_image, prompt=prompt, output_tokens=True
)
generation_time = time.time() - start_time
page_tags = output
"""
_log.debug(
f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
)
"""
page.predictions.vlm_response = VlmPrediction(
text=page_tags,
generation_time=generation_time,

View File

@ -0,0 +1,119 @@
import importlib.metadata
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Optional
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Cluster, Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import (
InlineVlmOptions,
TransformersModelType,
TransformersPromptStyle,
)
from docling.models.base_model import BaseLayoutModel, BasePageModel, BaseVlmModel
from docling.models.layout_model import LayoutModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
*,
layout_model: BaseLayoutModel,
vlm_model: BaseVlmModel,
):
self.layout_model = layout_model
self.vlm_model = vlm_model
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "two-staged-vlm"):
assert page.size is not None
page_image = page.get_image(
scale=self.vlm_model.scale, max_size=self.vlm_model.max_size
)
assert page_image is not None
pred_clusters = self.layout_model.predict_on_page_image(
page_image=page_image
)
page, processed_clusters, processed_cells = (
self.layout_model.postprocess_on_page_image(
page=page, clusters=pred_clusters
)
)
user_prompt = self.vlm_model.get_user_prompt(page=page)
prompt = self.formulate_prompt(
user_prompt=user_prompt,
clusters=processed_clusters,
image_width=page_image.width,
image_height=page_image.height,
)
start_time = time.time()
generated_text, generated_tokens = (
self.vlm_model.predict_on_page_image(
page_image=page_image, prompt=prompt
)
)
print("generated-text: \n", generated_text, "\n")
page.predictions.vlm_response = VlmPrediction(
text=generated_text,
generation_time=time.time() - start_time,
generated_tokens=generated_tokens,
)
yield page
def formulate_prompt(
self,
*,
user_prompt: str,
clusters: list[Cluster],
image_width: int,
image_height: int,
vlm_width: int = 512,
vlm_height: int = 512,
) -> str:
"""Formulate a prompt for the VLM."""
known_clusters = ["here is a list of unsorted text-blocks:", "<doctags>"]
for cluster in clusters:
print(" => ", cluster)
loc_l = f"<loc_{int(vlm_width * cluster.bbox.l / image_width)}>"
loc_b = f"<loc_{int(vlm_height * cluster.bbox.b / image_height)}>"
loc_r = f"<loc_{int(vlm_width * cluster.bbox.r / image_width)}>"
loc_t = f"<loc_{int(vlm_height * cluster.bbox.t / image_height)}>"
known_clusters.append(
f"<{cluster.label}>{loc_l}{loc_b}{loc_r}{loc_t}</{cluster.label}>"
)
known_clusters.append("</doctags>")
user_prompt = "\n".join(known_clusters) + f"\n\n{user_prompt}"
print("user-prompt: ", user_prompt, "\n")
return user_prompt

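A worked example of the <loc_..> scaling used in formulate_prompt; the page-image size and the bbox values are illustrative, and the 512x512 grid matches the vlm_width/vlm_height defaults above:

```python
image_width, image_height = 612, 792  # page-image size in pixels (assumed)
vlm_width = vlm_height = 512          # location-token grid of the VLM

# A cluster bbox in page-image coordinates (l, t, r, b), e.g. a table region.
l, t, r, b = 139.7, 337.5, 475.0, 469.5

loc_l = f"<loc_{int(vlm_width * l / image_width)}>"    # <loc_116>
loc_b = f"<loc_{int(vlm_height * b / image_height)}>"  # <loc_303>
loc_r = f"<loc_{int(vlm_width * r / image_width)}>"    # <loc_397>
loc_t = f"<loc_{int(vlm_height * t / image_height)}>"  # <loc_218>

print(f"<table>{loc_l}{loc_b}{loc_r}{loc_t}</table>")
```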
View File

@ -26,21 +26,24 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TwoStageVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.api_vlm_model import ApiVlmModel
from docling.models.layout_model import LayoutModel
from docling.models.vlm_models_inline.hf_transformers_model import (
HuggingFaceTransformersVlmModel,
)
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
from docling.models.vlm_models_inline.two_stage_vlm_model import (
TwoStageVlmModel,
)
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -107,6 +110,53 @@ class VlmPipeline(PaginatedPipeline):
raise ValueError(
f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
)
elif isinstance(self.pipeline_options.vlm_options, TwoStageVlmOptions):
twostagevlm_options = cast(
TwoStageVlmOptions, self.pipeline_options.vlm_options
)
stage_1_options = twostagevlm_options.layout_options
stage_2_options = twostagevlm_options.vlm_options
layout_model = LayoutModel(
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
options=LayoutOptions(
create_orphan_clusters=False, model_spec=stage_1_options
),
)
if (
isinstance(stage_2_options, InlineVlmOptions)
and stage_2_options.inference_framework == InferenceFramework.MLX
):
vlm_model_mlx = HuggingFaceMlxModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=stage_2_options,
)
self.build_pipe = [
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx)
]
elif (
isinstance(stage_2_options, InlineVlmOptions)
and stage_2_options.inference_framework
== InferenceFramework.TRANSFORMERS
):
vlm_model_hf = HuggingFaceTransformersVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=stage_2_options,
)
self.build_pipe = [
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf)
]
else:
raise ValueError(
f"Could not instantiate the right type of VLM pipeline: {stage_2_options}"
)
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument

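For completeness, a sketch of a two-stage configuration that would take the TRANSFORMERS branch above rather than the MLX one (VLM2STAGE itself wraps SMOLDOCLING_MLX); the pairing is illustrative:

```python
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import TwoStageVlmOptions
from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS

two_stage_hf = TwoStageVlmOptions(
    layout_options=DOCLING_LAYOUT_HERON,
    vlm_options=SMOLDOCLING_TRANSFORMERS,
    response_format=SMOLDOCLING_TRANSFORMERS.response_format,
)

pipeline_options = VlmPipelineOptions(vlm_options=two_stage_hf)
# Handing these options to DocumentConverter (as in the earlier CLI sketch) makes
# build_pipe a single TwoStageVlmModel wrapping a LayoutModel and a
# HuggingFaceTransformersVlmModel.
```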
View File

@ -213,10 +213,10 @@
"prov": [
{
"bbox": [
139.66741943359375,
139.6674041748047,
322.5054626464844,
475.00927734375,
454.45458984375
454.4546203613281
],
"page": 1,
"span": [

View File

@ -2705,7 +2705,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -2745,7 +2745,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858679533004761,
"cells": [
{
"index": 1,
@ -2785,7 +2785,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -2940,7 +2940,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.98504239320755,
"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -3155,7 +3155,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -3339,8 +3339,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"l": 139.6674041748047,
"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -7846,7 +7846,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -7911,7 +7911,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849975109100342,
"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -8243,8 +8243,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"l": 139.6674041748047,
"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -13641,7 +13641,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -13687,7 +13687,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858679533004761,
"cells": [
{
"index": 1,
@ -13733,7 +13733,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -13900,7 +13900,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.98504239320755,
"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -14121,7 +14121,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -14311,8 +14311,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"l": 139.6674041748047,
"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -19701,7 +19701,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -19772,7 +19772,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849975109100342,
"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -20116,7 +20116,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -20283,7 +20283,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.98504239320755,
"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -20504,7 +20504,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -20694,8 +20694,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"l": 139.6674041748047,
"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -26084,7 +26084,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -26155,7 +26155,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849975109100342,
"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -26499,7 +26499,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -26545,7 +26545,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858679533004761,
"cells": [
{
"index": 1,