Move to pipeline_options.layout_options.model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2025-07-08 11:24:06 +02:00 · 2025-07-08 11:24:06 +02:00 · af0461e5b1
commit af0461e5b1
parent f2094f858b a07ba863c4
19 changed files with 327 additions and 63 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,20 @@
 ## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04
 ### Feature
 * Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb))
 * Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1))
 ### Fix
 * Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e))
 * Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26))
 ### Performance
 * **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6))
 * Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839))
 ## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
 ### Feature
--- a/docling/datamodel/asr_model_specs.py
+++ b/docling/datamodel/asr_model_specs.py
@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
    verbose=True,
    timestamps=True,
    word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
 )
@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
    verbose=True,
    timestamps=True,
    word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
 )
@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
    verbose=True,
    timestamps=True,
    word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
 )
@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
    verbose=True,
    timestamps=True,
    word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
 )
@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
    verbose=True,
    timestamps=True,
    word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
 )
@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
    verbose=True,
    timestamps=True,
    word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
 )
--- a/docling/datamodel/layout_model_specs.py
+++ b/docling/datamodel/layout_model_specs.py
@ -0,0 +1,91 @@
 import logging
 from enum import Enum
 from pathlib import Path
 from typing import Optional
 from pydantic import BaseModel
 from docling.datamodel.accelerator_options import AcceleratorDevice
 _log = logging.getLogger(__name__)
 class LayoutModelConfig(BaseModel):
    """Describe one layout model: its identity, its source repository and devices.

    Instances of this class are declared as module-level constants and are
    handed to the layout model through ``LayoutOptions.model``.
    """
    # Identifier of this configuration (e.g. "docling_layout_heron").
    name: str
    # Repository id in "<org>/<repo>" form (e.g. "ds4sd/docling-layout-heron").
    repo_id: str
    # Repository revision to use (the configs in this module all use "main").
    revision: str
    # Path associated with the model artifacts; empty for the bundled configs.
    # NOTE(review): presumably relative to the downloaded repository - confirm.
    model_path: str
    # Accelerator devices on which the model may run.
    supported_devices: list[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]
    @property
    def model_repo_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--".

        Example: "ds4sd/docling-layout-heron" -> "ds4sd--docling-layout-heron".
        """
        repo_parts = self.repo_id.split("/")
        return "--".join(repo_parts)
 # HuggingFace Layout Models
 # Every config below pins revision="main" and leaves model_path empty.
 # Default Docling Layout Model
 # NOTE(review): this constant is named DOCLING_LAYOUT_V2 but its `name` field
 # is "docling_layout_old", while LayoutModelType declares "docling_layout_v2"
 # and "docling_layout_old" as separate values - confirm which name is intended.
 DOCLING_LAYOUT_V2 = LayoutModelConfig(
    name="docling_layout_old",
    repo_id="ds4sd/docling-layout-old",
    revision="main",
    model_path="",
 )
 # Heron layout model.
 DOCLING_LAYOUT_HERON = LayoutModelConfig(
    name="docling_layout_heron",
    repo_id="ds4sd/docling-layout-heron",
    revision="main",
    model_path="",
 )
 # Heron-101 layout model.
 DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
    name="docling_layout_heron_101",
    repo_id="ds4sd/docling-layout-heron-101",
    revision="main",
    model_path="",
 )
 # Egret layout models (medium / large / xlarge variants).
 DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
    name="docling_layout_egret_medium",
    repo_id="ds4sd/docling-layout-egret-medium",
    revision="main",
    model_path="",
 )
 DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
    name="docling_layout_egret_large",
    repo_id="ds4sd/docling-layout-egret-large",
    revision="main",
    model_path="",
 )
 DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
    name="docling_layout_egret_xlarge",
    repo_id="ds4sd/docling-layout-egret-xlarge",
    revision="main",
    model_path="",
 )
 # Example for a hypothetical alternative model
 # ALTERNATIVE_LAYOUT = LayoutModelConfig(
 #     name="alternative_layout",
 #     repo_id="someorg/alternative-layout",
 #     revision="main",
 #     model_path="model_artifacts/layout_alt",
 # )
 class LayoutModelType(str, Enum):
    """String-valued identifiers for the known layout models.

    Because the enum also derives from ``str``, each member compares equal to
    its plain string value.
    """
    # NOTE(review): "docling_layout_v2" does not match the `name` of any
    # LayoutModelConfig declared in this module (the DOCLING_LAYOUT_V2 config
    # uses name="docling_layout_old") - confirm the intended mapping.
    DOCLING_LAYOUT_V2 = "docling_layout_v2"
    DOCLING_LAYOUT_OLD = "docling_layout_old"
    DOCLING_LAYOUT_HERON = "docling_layout_heron"
    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
    # ALTERNATIVE_LAYOUT = "alternative_layout"
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -1,4 +1,5 @@
 import logging
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@ -274,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
    )
 class LayoutOptions(BaseModel):
    """Options for layout processing.

    An instance is passed to ``LayoutModel`` as ``options``; the layout model
    reads ``model`` to locate its weights and forwards the whole object to
    ``LayoutPostprocessor``.
    """
    # When False, LayoutPostprocessor does not create clusters for unassigned cells.
    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
    # Layout model configuration to load; defaults to DOCLING_LAYOUT_V2.
    model: LayoutModelConfig = DOCLING_LAYOUT_V2
 class AsrPipelineOptions(PipelineOptions):
    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
    artifacts_path: Optional[Union[Path, str]] = None
@ -298,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
    picture_description_options: PictureDescriptionBaseOptions = (
        smolvlm_picture_description
    )
    layout_options: LayoutOptions = LayoutOptions()
    images_scale: float = 1.0
    generate_page_images: bool = False
@ -315,8 +324,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
        True  # Always True since parsed_page is now mandatory
    )
    layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2
 class ProcessingPipeline(str, Enum):
    STANDARD = "standard"
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
    kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
    scale: float = 2.0
    max_size: Optional[int] = None
    temperature: float = 0.0
 class ResponseFormat(str, Enum):
@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
    AUTOMODEL = "automodel"
    AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
    AUTOMODEL_CAUSALLM = "automodel-causallm"
    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
 class TransformersPromptStyle(str, Enum):
    CHAT = "chat"
    RAW = "raw"
 class InlineVlmOptions(BaseVlmOptions):
@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
    inference_framework: InferenceFramework
    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
    response_format: ResponseFormat
    torch_dtype: Optional[str] = None
@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
        AcceleratorDevice.MPS,
    ]
    temperature: float = 0.0
    stop_strings: List[str] = []
    extra_generation_config: Dict[str, Any] = {}
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
            self.timeout = self.vlm_options.timeout
            self.concurrency = self.vlm_options.concurrency
            self.prompt_content = (
                f"This is a page from a document.\n{self.vlm_options.prompt}"
            )
            self.params = {
                **self.vlm_options.params,
-                "temperature": 0,
+                "temperature": self.vlm_options.temperature,
            }
    def __call__(
@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
                        if hi_res_image.mode != "RGB":
                            hi_res_image = hi_res_image.convert("RGB")
                    if callable(self.vlm_options.prompt):
                        prompt = self.vlm_options.prompt(page.parsed_page)
                    else:
                        prompt = self.vlm_options.prompt
                    page_tags = api_image_request(
                        image=hi_res_image,
-                        prompt=self.prompt_content,
+                        prompt=prompt,
                        url=self.vlm_options.url,
                        timeout=self.timeout,
                        headers=self.vlm_options.headers,
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@ -14,7 +14,8 @@ from PIL import Image
 from pydantic import BaseModel
 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.models.base_model import BaseEnrichmentModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
 from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
    kind: Literal["document_picture_classifier"] = "document_picture_classifier"
-class DocumentPictureClassifier(BaseEnrichmentModel):
+class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
    """
    A model for classifying pictures in documents.
@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
    def __call__(
        self,
        doc: DoclingDocument,
-        element_batch: Iterable[NodeItem],
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        """
        Processes a batch of elements and enriches them with classification predictions.
@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
        ----------
        doc : DoclingDocument
            The document containing the elements to be processed.
-        element_batch : Iterable[NodeItem]
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
            A batch of pictures to classify.
        Returns
@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
        """
        if not self.enabled:
            for element in element_batch:
-                yield element
+                yield element.item
            return
        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[PictureItem] = []
        for el in element_batch:
-            assert isinstance(el, PictureItem)
+            assert isinstance(el.item, PictureItem)
-            elements.append(el)
+            elements.append(el.item)
-            img = el.get_image(doc)
+            images.append(el.image)
            assert img is not None
            images.append(img)
        outputs = self.document_picture_classifier.predict(images)
-        for element, output in zip(elements, outputs):
+        for item, output in zip(elements, outputs):
-            element.annotations.append(
+            item.annotations.append(
                PictureClassificationData(
                    provenance="DocumentPictureClassifier",
                    predicted_classes=[
@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                )
            )
-            yield element
+            yield item
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -13,6 +13,7 @@ from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
 from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
@ -49,12 +50,14 @@ class LayoutModel(BasePageModel):
        self,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
-        layout_model_config: LayoutModelConfig,
+        options: LayoutOptions,
    ):
        from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
        self.options = options
        device = decide_device(accelerator_options.device)
-        self.layout_model_config = layout_model_config
+        layout_model_config = options.model
        model_repo_folder = layout_model_config.model_repo_folder
        model_path = layout_model_config.model_path
@ -182,7 +185,7 @@ class LayoutModel(BasePageModel):
                    # Apply postprocessing
                    processed_clusters, processed_cells = LayoutPostprocessor(
-                        page, clusters
+                        page, clusters, self.options
                    ).postprocess()
                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
    InlineVlmOptions,
    TransformersModelType,
    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
            from transformers import (
                AutoModel,
                AutoModelForCausalLM,
                AutoModelForImageTextToText,
                AutoModelForVision2Seq,
                AutoProcessor,
                BitsAndBytesConfig,
@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                == TransformersModelType.AUTOMODEL_VISION2SEQ
            ):
                model_cls = AutoModelForVision2Seq
            elif (
                self.vlm_options.transformers_model_type
                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
            ):
                model_cls = AutoModelForImageTextToText
            self.processor = AutoProcessor.from_pretrained(
                artifacts_path,
@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                    )
                    # Define prompt structure
-                    prompt = self.formulate_prompt()
+                    if callable(self.vlm_options.prompt):
                        user_prompt = self.vlm_options.prompt(page.parsed_page)
                    else:
                        user_prompt = self.vlm_options.prompt
                    prompt = self.formulate_prompt(user_prompt)
                    inputs = self.processor(
                        text=prompt, images=[hi_res_image], return_tensors="pt"
@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                yield page
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
        """Formulate a prompt for the VLM."""
-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
            return user_prompt
        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
            _log.debug("Using specialized prompt for Phi-4")
            # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
@ -173,25 +187,30 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
            assistant_prompt = "<|assistant|>"
            prompt_suffix = "<|end|>"
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
            return prompt
-        messages = [
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
-            {
+            messages = [
-                "role": "user",
+                {
-                "content": [
+                    "role": "user",
-                    {
+                    "content": [
-                        "type": "text",
+                        {
-                        "text": "This is a page from a document.",
+                            "type": "text",
-                    },
+                            "text": "This is a page from a document.",
-                    {"type": "image"},
+                        },
-                    {"type": "text", "text": self.vlm_options.prompt},
+                        {"type": "image"},
-                ],
+                        {"type": "text", "text": user_prompt},
-            }
+                    ],
-        ]
+                }
-        prompt = self.processor.apply_chat_template(
+            ]
-            messages, add_generation_prompt=False
+            prompt = self.processor.apply_chat_template(
                messages, add_generation_prompt=False
            )
            return prompt
        raise RuntimeError(
            f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
        )
        return prompt
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder
            self.param_question = vlm_options.prompt
            ## Load the model
            self.vlm_model, self.processor = load(artifacts_path)
            self.config = load_config(artifacts_path)
@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                        if hi_res_image.mode != "RGB":
                            hi_res_image = hi_res_image.convert("RGB")
                    if callable(self.vlm_options.prompt):
                        user_prompt = self.vlm_options.prompt(page.parsed_page)
                    else:
                        user_prompt = self.vlm_options.prompt
                    prompt = self.apply_chat_template(
-                        self.processor, self.config, self.param_question, num_images=1
+                        self.processor, self.config, user_prompt, num_images=1
                    )
                    start_time = time.time()
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -81,7 +81,7 @@ class StandardPdfPipeline(PaginatedPipeline):
            LayoutModel(
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
-                layout_model_config=pipeline_options.layout_model_config,
+                options=pipeline_options.layout_options,
            ),
            # Table structure model
            TableStructureModel(
@ -130,6 +130,7 @@ class StandardPdfPipeline(PaginatedPipeline):
        if (
            self.pipeline_options.do_formula_enrichment
            or self.pipeline_options.do_code_enrichment
            or self.pipeline_options.do_picture_classification
            or self.pipeline_options.do_picture_description
        ):
            self.keep_backend = True
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()
                page.parsed_page = page._backend.get_segmented_page()
        return page
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
 from rtree import index
 from docling.datamodel.base_models import BoundingBox, Cluster, Page
 from docling.datamodel.pipeline_options import LayoutOptions
 _log = logging.getLogger(__name__)
@ -194,12 +195,16 @@ class LayoutPostprocessor:
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }
-    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+    def __init__(
        self, page: Page, clusters: List[Cluster], options: LayoutOptions
    ) -> None:
        """Initialize processor with page and clusters."""
        self.cells = page.cells
        self.page = page
        self.page_size = page.size
        self.all_clusters = clusters
        self.options = options
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
        ]
@ -267,7 +272,7 @@ class LayoutPostprocessor:
        # Handle orphaned cells
        unassigned = self._find_unassigned_cells(clusters)
-        if unassigned:
+        if unassigned and self.options.create_orphan_clusters:
            next_id = max((c.id for c in self.all_clusters), default=0) + 1
            orphan_clusters = []
            for i, cell in enumerate(unassigned):
--- a/docs/examples/compare_vlm_models.py
+++ b/docs/examples/compare_vlm_models.py
@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 from docling.datamodel import vlm_model_specs
 from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
    TransformersPromptStyle,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
@ -101,6 +108,33 @@ if __name__ == "__main__":
    out_path = Path("scratch")
    out_path.mkdir(parents=True, exist_ok=True)
    ## Definition of more inline models
    llava_qwen = InlineVlmOptions(
        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
        # prompt="Read text in the image.",
        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
        # prompt="Parse the reading order of this document.",
        response_format=ResponseFormat.MARKDOWN,
        inference_framework=InferenceFramework.TRANSFORMERS,
        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
        scale=2.0,
        temperature=0.0,
    )
    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
    dolphin_oneshot = InlineVlmOptions(
        repo_id="ByteDance/Dolphin",
        prompt="<s>Read text in the image. <Answer/>",
        response_format=ResponseFormat.MARKDOWN,
        inference_framework=InferenceFramework.TRANSFORMERS,
        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
        transformers_prompt_style=TransformersPromptStyle.RAW,
        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
        scale=2.0,
        temperature=0.0,
    )
    ## Use VlmPipeline
    pipeline_options = VlmPipelineOptions()
    pipeline_options.generate_page_images = True
@ -121,6 +155,9 @@ if __name__ == "__main__":
        vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
        vlm_model_specs.PHI4_TRANSFORMERS,
        vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
        ## More inline models
        dolphin_oneshot,
        llava_qwen,
    ]
    # Remove MLX models if not on Mac
--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@ -1,8 +1,10 @@
 import logging
 import os
 from pathlib import Path
 from typing import Optional
 import requests
 from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv
 from docling.datamodel.base_models import InputFormat
@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
    return options
 #### Using LM Studio with OlmOcr model
 def lms_olmocr_vlm_options(model: str):
    """Build ``ApiVlmOptions`` for an OlmOcr model served through LM Studio.

    The prompt is not a fixed string but a callable evaluated per page, so
    that text and image positions already extracted from the page can be
    embedded in the prompt.

    Args:
        model: Model identifier sent in the request parameters.

    Returns:
        The configured ``ApiVlmOptions`` instance.
    """
    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
        """Return the prompt for one page, with anchor text if ``page`` is given."""
        if page is None:
            # No parsed page available: use the image-only prompt.
            return (
                "Below is the image of one page of a document. Just return the plain text"
                " representation of this document as if you were reading it naturally.\n"
                "Do not hallucinate.\n"
            )
        page_w = int(page.dimension.width)
        page_h = int(page.dimension.height)
        # The un-truncated height is what the coordinate conversion receives.
        full_height = page.dimension.height
        lines = [f"Page dimensions: {page_w}x{page_h}"]
        # One entry per non-blank text line, positioned in bottom-left-origin coordinates.
        for cell in page.textline_cells:
            if cell.text.strip():
                box = cell.rect.to_bounding_box().to_bottom_left_origin(full_height)
                lines.append(f"[{int(box.l)}x{int(box.b)}] {cell.text}")
        # One entry per bitmap, described by its two corner points.
        for bitmap in page.bitmap_resources:
            box = bitmap.rect.to_bounding_box().to_bottom_left_origin(full_height)
            lines.append(
                f"[Image {int(box.l)}x{int(box.b)} to {int(box.r)}x{int(box.t)}]"
            )
        # Only the dimensions header was collected: describe the page as one image.
        if len(lines) == 1:
            lines.append(f"[Image 0x0 to {page_w}x{page_h}]")
        # The original prompt sorts the cells; this demo skips that step.
        raw_text = "\n".join(lines)
        return (
            "Below is the image of one page of a document, as well as some raw textual"
            " content that was previously extracted for it. Just return the plain text"
            " representation of this document as if you were reading it naturally.\n"
            "Do not hallucinate.\n"
            f"RAW_TEXT_START\n{raw_text}\nRAW_TEXT_END"
        )
    return ApiVlmOptions(
        url="http://localhost:1234/v1/chat/completions",
        params={"model": model},
        prompt=_dynamic_olmocr_prompt,
        timeout=90,
        scale=1.0,
        max_size=1024,  # from OlmOcr pipeline
        response_format=ResponseFormat.MARKDOWN,
    )
 #### Using Ollama
@ -123,6 +188,12 @@ def main():
    #     format=ResponseFormat.MARKDOWN,
    # )
    # Example using the OlmOcr (dynamic prompt) model with LM Studio:
    # (uncomment the following lines)
    # pipeline_options.vlm_options = lms_olmocr_vlm_options(
    #     model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
    # )
    # Example using the Granite Vision model with Ollama:
    # (uncomment the following lines)
    # pipeline_options.vlm_options = ollama_vlm_options(
--- a/docs/installation/index.md
+++ b/docs/installation/index.md
@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
    === "RHEL"
        ```console
-        dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+        dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel
        TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
        echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
        ```
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.39.0"  # DO NOT EDIT, updated automatically
+version = "2.40.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
--- a/tests/test_document_picture_classifier.py
+++ b/tests/test_document_picture_classifier.py
@ -17,8 +17,9 @@ def get_converter():
    pipeline_options.do_table_structure = False
    pipeline_options.do_code_enrichment = False
    pipeline_options.do_formula_enrichment = False
    pipeline_options.generate_picture_images = False
    pipeline_options.generate_page_images = False
    pipeline_options.do_picture_classification = True
    pipeline_options.generate_picture_images = True
    pipeline_options.images_scale = 2
    converter = DocumentConverter(
--- a/uv.lock
+++ b/uv.lock
@ -805,7 +805,7 @@ wheels = [
 [[package]]
 name = "docling"
-version = "2.39.0"
+version = "2.40.0"
 source = { editable = "." }
 dependencies = [
    { name = "beautifulsoup4" },