commit af0461e5b1

Move to pipeline_options.layout_options.model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>


CHANGELOG.md (17 changed lines)
@@ -1,3 +1,20 @@
+## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04
+
+### Feature
+
+* Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb))
+* Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1))
+
+### Fix
+
+* Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e))
+* Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26))
+
+### Performance
+
+* **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6))
+* Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839))
+
 ## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
 
 ### Feature
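
The last performance entry refers to the lazy-import pattern: a heavy dependency is imported inside the function that first needs it, so importing the package itself stays cheap. A minimal illustrative sketch of the idea, not the actual docling code:

```python
def pick_device(preferred: str = "auto") -> str:
    # The expensive import runs on first call rather than at module import
    # time, so `import mypackage` does not pay the torch startup cost.
    import torch

    if preferred == "auto":
        return "cuda" if torch.cuda.is_available() else "cpu"
    return preferred
```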

docling/datamodel/asr_model_specs.py
@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
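
All six hunks correct the same misspelled keyword, `temperatue`. The bug is easy to miss because pydantic models ignore unknown fields by default, so the typo never raised an error and `temperature` silently kept its default. A stand-in sketch (`WhisperOpts` is illustrative, not the real `InlineAsrNativeWhisperOptions`):

```python
from pydantic import BaseModel, ConfigDict


class WhisperOpts(BaseModel):
    temperature: float = 0.0


# With pydantic's default config the unknown keyword is dropped silently:
print(WhisperOpts(temperatue=0.7).temperature)  # prints 0.0


class StrictWhisperOpts(BaseModel):
    model_config = ConfigDict(extra="forbid")  # reject unknown fields

    temperature: float = 0.0


# StrictWhisperOpts(temperatue=0.7) raises a ValidationError instead.
```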

docling/datamodel/layout_model_specs.py (new file, 91 lines)
@@ -0,0 +1,91 @@
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+from pydantic import BaseModel
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+
+_log = logging.getLogger(__name__)
+
+
+class LayoutModelConfig(BaseModel):
+    name: str
+    repo_id: str
+    revision: str
+    model_path: str
+    supported_devices: list[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    @property
+    def model_repo_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+# HuggingFace Layout Models
+
+# Default Docling Layout Model
+DOCLING_LAYOUT_V2 = LayoutModelConfig(
+    name="docling_layout_old",
+    repo_id="ds4sd/docling-layout-old",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON = LayoutModelConfig(
+    name="docling_layout_heron",
+    repo_id="ds4sd/docling-layout-heron",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
+    name="docling_layout_heron_101",
+    repo_id="ds4sd/docling-layout-heron-101",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
+    name="docling_layout_egret_medium",
+    repo_id="ds4sd/docling-layout-egret-medium",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
+    name="docling_layout_egret_large",
+    repo_id="ds4sd/docling-layout-egret-large",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
+    name="docling_layout_egret_xlarge",
+    repo_id="ds4sd/docling-layout-egret-xlarge",
+    revision="main",
+    model_path="",
+)
+
+# Example for a hypothetical alternative model
+# ALTERNATIVE_LAYOUT = LayoutModelConfig(
+#     name="alternative_layout",
+#     repo_id="someorg/alternative-layout",
+#     revision="main",
+#     model_path="model_artifacts/layout_alt",
+# )
+
+
+class LayoutModelType(str, Enum):
+    DOCLING_LAYOUT_V2 = "docling_layout_v2"
+    DOCLING_LAYOUT_OLD = "docling_layout_old"
+    DOCLING_LAYOUT_HERON = "docling_layout_heron"
+    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
+    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
+    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
+    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
+    # ALTERNATIVE_LAYOUT = "alternative_layout"
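
Beyond the bundled checkpoints, `LayoutModelConfig` is a plain pydantic model, so a custom checkpoint can be described the same way. A sketch (the repo id below is a placeholder, not a real repository):

```python
from docling.datamodel.layout_model_specs import (
    DOCLING_LAYOUT_EGRET_MEDIUM,
    LayoutModelConfig,
)

custom_layout = LayoutModelConfig(
    name="custom_layout",
    repo_id="my-org/my-layout-model",  # placeholder repo id
    revision="main",
    model_path="",
)

# model_repo_folder mirrors the cache folder naming used for downloaded weights:
print(custom_layout.model_repo_folder)        # my-org--my-layout-model
print(DOCLING_LAYOUT_EGRET_MEDIUM.repo_id)    # ds4sd/docling-layout-egret-medium
```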
docling/datamodel/pipeline_options.py

@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -274,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
 
 
+class LayoutOptions(BaseModel):
+    """Options for layout processing."""
+
+    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+    model: LayoutModelConfig = DOCLING_LAYOUT_V2
+
+
 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None
@@ -298,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
     )
+    layout_options: LayoutOptions = LayoutOptions()
 
     images_scale: float = 1.0
     generate_page_images: bool = False
@@ -315,8 +324,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         True  # Always True since parsed_page is now mandatory
     )
 
-    layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2
-
 
 class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
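
Together these hunks implement the commit title: the layout model choice moves from `PdfPipelineOptions.layout_model_config` to `pipeline_options.layout_options.model`, next to the other layout postprocessing knobs. A hedged usage sketch based on the options shown above; the converter wiring follows docling's usual pattern:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
# Before: pipeline_options.layout_model_config = DOCLING_LAYOUT_HERON
# After this commit the model is one field of LayoutOptions:
pipeline_options.layout_options = LayoutOptions(
    model=DOCLING_LAYOUT_HERON,
    create_orphan_clusters=False,  # optionally skip clusters for unassigned cells
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```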

docling/datamodel/pipeline_options_vlm_model.py

@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 
 
 class ResponseFormat(str, Enum):
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
 
     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat
 
     torch_dtype: Optional[str] = None
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
 
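
Widening `prompt` from `str` to `Union[str, Callable[[Optional[SegmentedPage]], str]]` is what enables page-aware prompting: a callable receives the parsed page (or `None`) and returns the instruction text. A small sketch; the full OlmOcr-style version appears in `docs/examples/vlm_pipeline_api_model.py` further down:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage


def page_aware_prompt(page: Optional[SegmentedPage]) -> str:
    # Fall back to a generic instruction when no parsed page is available.
    if page is None:
        return "Convert this page to markdown."
    width = int(page.dimension.width)
    height = int(page.dimension.height)
    return f"Convert this {width}x{height} page to markdown."
```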
@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
 
         self.timeout = self.vlm_options.timeout
         self.concurrency = self.vlm_options.concurrency
-        self.prompt_content = (
-            f"This is a page from a document.\n{self.vlm_options.prompt}"
-        )
         self.params = {
             **self.vlm_options.params,
-            "temperature": 0,
+            "temperature": self.vlm_options.temperature,
         }
 
     def __call__(
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    prompt = self.vlm_options.prompt
+
                 page_tags = api_image_request(
                     image=hi_res_image,
-                    prompt=self.prompt_content,
+                    prompt=prompt,
                     url=self.vlm_options.url,
                     timeout=self.timeout,
                     headers=self.vlm_options.headers,
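
Two hard-coded values disappear here: the fixed "This is a page from a document." preamble (the prompt now comes from the options, possibly as a callable) and `"temperature": 0` (now read from `vlm_options.temperature`, which `BaseVlmOptions` defaults to 0.0). A sketch; the `ApiVlmOptions` import path is an assumption, it is not shown in this diff:

```python
from docling.datamodel.pipeline_options_vlm_model import (  # import path assumed
    ApiVlmOptions,
    ResponseFormat,
)

options = ApiVlmOptions(
    url="http://localhost:1234/v1/chat/completions",
    params=dict(model="some-local-model"),  # placeholder model name
    prompt="Convert this page to markdown.",
    temperature=0.2,  # forwarded into the request params by ApiVlmModel
    timeout=90,
    response_format=ResponseFormat.MARKDOWN,
)
```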
@@ -14,7 +14,8 @@ from PIL import Image
 from pydantic import BaseModel
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.models.base_model import BaseEnrichmentModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
     kind: Literal["document_picture_classifier"] = "document_picture_classifier"
 
 
-class DocumentPictureClassifier(BaseEnrichmentModel):
+class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
     """
     A model for classifying pictures in documents.
 
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def __call__(
         self,
         doc: DoclingDocument,
-        element_batch: Iterable[NodeItem],
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
     ) -> Iterable[NodeItem]:
         """
         Processes a batch of elements and enriches them with classification predictions.
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         ----------
         doc : DoclingDocument
             The document containing the elements to be processed.
-        element_batch : Iterable[NodeItem]
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
             A batch of pictures to classify.
 
         Returns
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         """
         if not self.enabled:
             for element in element_batch:
-                yield element
+                yield element.item
             return
 
         images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
-            assert isinstance(el, PictureItem)
-            elements.append(el)
-            img = el.get_image(doc)
-            assert img is not None
-            images.append(img)
+            assert isinstance(el.item, PictureItem)
+            elements.append(el.item)
+            images.append(el.image)
 
         outputs = self.document_picture_classifier.predict(images)
 
-        for element, output in zip(elements, outputs):
-            element.annotations.append(
+        for item, output in zip(elements, outputs):
+            item.annotations.append(
                 PictureClassificationData(
                     provenance="DocumentPictureClassifier",
                     predicted_classes=[
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
             )
         )
 
-        yield element
+        yield item
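
The base-class switch changes the calling contract: the pipeline now hands the classifier each picture together with a pre-cropped image, instead of the model pulling images from the document itself via `el.get_image(doc)`. This is also why the standard PDF pipeline below starts keeping the page backend alive when picture classification is enabled. A toy consumer of that contract, assuming only the `.item`/`.image` fields visible in this diff:

```python
from typing import Iterable

from docling_core.types.doc import DoclingDocument, NodeItem, PictureItem

from docling.datamodel.base_models import ItemAndImageEnrichmentElement


def classify_batch(
    doc: DoclingDocument, element_batch: Iterable[ItemAndImageEnrichmentElement]
) -> Iterable[NodeItem]:
    # Each element already carries its cropped image; no backend access needed.
    for el in element_batch:
        assert isinstance(el.item, PictureItem)
        _image = el.image  # pre-cropped image supplied by the pipeline
        yield el.item
```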
docling/models/layout_model.py hunks follow.

@@ -13,6 +13,7 @@ from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
+from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
@@ -49,12 +50,14 @@ class LayoutModel(BasePageModel):
         self,
         artifacts_path: Optional[Path],
         accelerator_options: AcceleratorOptions,
-        layout_model_config: LayoutModelConfig,
+        options: LayoutOptions,
     ):
         from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 
+        self.options = options
+
         device = decide_device(accelerator_options.device)
-        self.layout_model_config = layout_model_config
+        layout_model_config = options.model
         model_repo_folder = layout_model_config.model_repo_folder
         model_path = layout_model_config.model_path
 
@@ -182,7 +185,7 @@ class LayoutModel(BasePageModel):
                 # Apply postprocessing
 
                 processed_clusters, processed_cells = LayoutPostprocessor(
-                    page, clusters
+                    page, clusters, self.options
                 ).postprocess()
                 # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
 
@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
         from transformers import (
             AutoModel,
             AutoModelForCausalLM,
+            AutoModelForImageTextToText,
             AutoModelForVision2Seq,
             AutoProcessor,
             BitsAndBytesConfig,
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+            ):
+                model_cls = AutoModelForImageTextToText
 
         self.processor = AutoProcessor.from_pretrained(
             artifacts_path,
@@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 )
 
                 # Define prompt structure
-                prompt = self.formulate_prompt()
+                if callable(self.vlm_options.prompt):
+                    user_prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    user_prompt = self.vlm_options.prompt
+                prompt = self.formulate_prompt(user_prompt)
 
                 inputs = self.processor(
                     text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
 
                 yield page
 
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
        """Formulate a prompt for the VLM."""
 
-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
 
@@ -173,25 +187,30 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
             assistant_prompt = "<|assistant|>"
             prompt_suffix = "<|end|>"
 
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
 
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "This is a page from a document.",
-                    },
-                    {"type": "image"},
-                    {"type": "text", "text": self.vlm_options.prompt},
-                ],
-            }
-        ]
-        prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
-        )
-        return prompt
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=False
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
        )
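
The new `TransformersPromptStyle.RAW` short-circuits `formulate_prompt()` and hands the user prompt to the model verbatim, skipping the processor's chat template. This suits models with their own prompt markup; the `compare_vlm_models.py` example further down uses it for ByteDance/Dolphin. A minimal sketch assembled from options that appear in this diff:

```python
from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersPromptStyle,
)

# RAW: the prompt string below is sent as-is, chat template not applied.
dolphin_opts = InlineVlmOptions(
    repo_id="ByteDance/Dolphin",
    prompt="<s>Read text in the image. <Answer/>",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_prompt_style=TransformersPromptStyle.RAW,
)
```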
@@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         elif (artifacts_path / repo_cache_folder).exists():
             artifacts_path = artifacts_path / repo_cache_folder
 
-        self.param_question = vlm_options.prompt
-
         ## Load the model
         self.vlm_model, self.processor = load(artifacts_path)
         self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    user_prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    user_prompt = self.vlm_options.prompt
                 prompt = self.apply_chat_template(
-                    self.processor, self.config, self.param_question, num_images=1
+                    self.processor, self.config, user_prompt, num_images=1
                 )
 
                 start_time = time.time()
@@ -81,7 +81,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             LayoutModel(
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
-                layout_model_config=pipeline_options.layout_model_config,
+                options=pipeline_options.layout_options,
             ),
             # Table structure model
             TableStructureModel(
@@ -130,6 +130,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_classification
             or self.pipeline_options.do_picture_description
         ):
             self.keep_backend = True
@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
 from rtree import index
 
 from docling.datamodel.base_models import BoundingBox, Cluster, Page
+from docling.datamodel.pipeline_options import LayoutOptions
 
 _log = logging.getLogger(__name__)
 
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
         DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
     }
 
-    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+    def __init__(
+        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+    ) -> None:
         """Initialize processor with page and clusters."""
+
         self.cells = page.cells
         self.page = page
         self.page_size = page.size
         self.all_clusters = clusters
+        self.options = options
         self.regular_clusters = [
             c for c in clusters if c.label not in self.SPECIAL_TYPES
         ]
@@ -267,7 +272,7 @@ class LayoutPostprocessor:
 
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)
-        if unassigned:
+        if unassigned and self.options.create_orphan_clusters:
             next_id = max((c.id for c in self.all_clusters), default=0) + 1
             orphan_clusters = []
             for i, cell in enumerate(unassigned):

docs/examples/compare_vlm_models.py (39 changed lines)
@@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 
 from docling.datamodel import vlm_model_specs
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
@@ -101,6 +108,33 @@ if __name__ == "__main__":
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)
 
+    ## Definition of more inline models
+    llava_qwen = InlineVlmOptions(
+        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
+        # prompt="Read text in the image.",
+        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+        # prompt="Parse the reading order of this document.",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
+    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
+    dolphin_oneshot = InlineVlmOptions(
+        repo_id="ByteDance/Dolphin",
+        prompt="<s>Read text in the image. <Answer/>",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        transformers_prompt_style=TransformersPromptStyle.RAW,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
     pipeline_options.generate_page_images = True
@@ -121,6 +155,9 @@ if __name__ == "__main__":
         vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
         vlm_model_specs.PHI4_TRANSFORMERS,
         vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
+        ## More inline models
+        dolphin_oneshot,
+        llava_qwen,
     ]
 
     # Remove MLX models if not on Mac

docs/examples/vlm_pipeline_api_model.py (71 changed lines)
@@ -1,8 +1,10 @@
 import logging
 import os
 from pathlib import Path
+from typing import Optional
 
 import requests
+from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv
 
 from docling.datamodel.base_models import InputFormat
@@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
     return options
 
 
+#### Using LM Studio with OlmOcr model
+
+
+def lms_olmocr_vlm_options(model: str):
+    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
+        if page is None:
+            return (
+                "Below is the image of one page of a document. Just return the plain text"
+                " representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+            )
+
+        anchor = [
+            f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
+        ]
+
+        for text_cell in page.textline_cells:
+            if not text_cell.text.strip():
+                continue
+            bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
+
+        for image_cell in page.bitmap_resources:
+            bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(
+                f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
+            )
+
+        if len(anchor) == 1:
+            anchor.append(
+                f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
+            )
+
+        # Original prompt uses cells sorting. We are skipping it in this demo.
+
+        base_text = "\n".join(anchor)
+
+        return (
+            f"Below is the image of one page of a document, as well as some raw textual"
+            f" content that was previously extracted for it. Just return the plain text"
+            f" representation of this document as if you were reading it naturally.\n"
+            f"Do not hallucinate.\n"
+            f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+        )
+
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+        ),
+        prompt=_dynamic_olmocr_prompt,
+        timeout=90,
+        scale=1.0,
+        max_size=1024,  # from OlmOcr pipeline
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
 #### Using Ollama
@@ -123,6 +188,12 @@ def main():
     # format=ResponseFormat.MARKDOWN,
     # )
 
+    # Example using the OlmOcr (dynamic prompt) model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_olmocr_vlm_options(
+    #     model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
+    # )
+
     # Example using the Granite Vision model with Ollama:
     # (uncomment the following lines)
     # pipeline_options.vlm_options = ollama_vlm_options(

docs/installation/index.md (2 changed lines)
@@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 architectures.
 === "RHEL"
 
     ```console
-    dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+    dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel
     TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
     echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
     ```
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.39.0"  # DO NOT EDIT, updated automatically
+version = "2.40.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
@@ -17,8 +17,9 @@ def get_converter():
     pipeline_options.do_table_structure = False
     pipeline_options.do_code_enrichment = False
     pipeline_options.do_formula_enrichment = False
-    pipeline_options.generate_picture_images = False
     pipeline_options.generate_page_images = False
+    pipeline_options.do_picture_classification = True
+    pipeline_options.generate_picture_images = True
     pipeline_options.images_scale = 2
 
     converter = DocumentConverter(