From ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb Mon Sep 17 00:00:00 2001
From: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Date: Fri, 4 Jul 2025 15:36:13 +0200
Subject: [PATCH 1/7] feat: Introduce LayoutOptions to control layout
 postprocessing behaviour (#1870)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/datamodel/pipeline_options.py     |  8 ++++++++
 docling/models/layout_model.py            | 10 ++++++++--
 docling/pipeline/standard_pdf_pipeline.py |  1 +
 docling/utils/layout_postprocessor.py     |  9 +++++++--
 4 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 11e085b7..fcf091ef 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
 
 
+class LayoutOptions(BaseModel):
+    """Options for layout processing."""
+
+    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+
+
 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None
@@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
     )
+    layout_options: LayoutOptions = LayoutOptions()
 
     images_scale: float = 1.0
     generate_page_images: bool = False
diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index da75bb8f..44e7286d 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -12,6 +12,7 @@ from PIL import Image
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
@@ -48,10 +49,15 @@ class LayoutModel(BasePageModel):
     CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
 
     def __init__(
-        self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
+        self,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        options: LayoutOptions,
     ):
         from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 
+        self.options = options
+
         device = decide_device(accelerator_options.device)
 
         if artifacts_path is None:
@@ -177,7 +183,7 @@ class LayoutModel(BasePageModel):
                     # Apply postprocessing
 
                     processed_clusters, processed_cells = LayoutPostprocessor(
-                        page, clusters
+                        page, clusters, self.options
                     ).postprocess()
                     # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
 
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index ad4f36da..8861174a 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -80,6 +80,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             LayoutModel(
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
+                options=pipeline_options.layout_options,
             ),
             # Table structure model
             TableStructureModel(
diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py
index 3db1cf8d..a98b3aab 100644
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
 from rtree import index
 
 from docling.datamodel.base_models import BoundingBox, Cluster, Page
+from docling.datamodel.pipeline_options import LayoutOptions
 
 _log = logging.getLogger(__name__)
 
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
         DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
     }
 
-    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+    def __init__(
+        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+    ) -> None:
         """Initialize processor with page and clusters."""
+
         self.cells = page.cells
         self.page = page
         self.page_size = page.size
         self.all_clusters = clusters
+        self.options = options
         self.regular_clusters = [
             c for c in clusters if c.label not in self.SPECIAL_TYPES
         ]
@@ -267,7 +272,7 @@ class LayoutPostprocessor:
 
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)
-        if unassigned:
+        if unassigned and self.options.create_orphan_clusters:
             next_id = max((c.id for c in self.all_clusters), default=0) + 1
             orphan_clusters = []
             for i, cell in enumerate(unassigned):

From f4a1c06937f5356758296cd9994ab3a8092bbe01 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Fri, 4 Jul 2025 15:31:36 +0000
Subject: [PATCH 2/7] chore: bump version to 2.40.0 [skip ci]

---
 CHANGELOG.md   | 17 ++++++++++++++
 pyproject.toml |  2 +-
 uv.lock        | 60 +++++++++++++++++++++++++-------------------------
 3 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e04a4dd..d49f5a3b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,20 @@
+## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04
+
+### Feature
+
+* Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb))
+* Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1))
+
+### Fix
+
+* Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e))
+* Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26))
+
+### Performance
+
+* **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6))
+* Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839))
+
 ## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
 
 ### Feature
diff --git a/pyproject.toml b/pyproject.toml
index 7139c031..1bd0f3d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.39.0"  # DO NOT EDIT, updated automatically
+version = "2.40.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
diff --git a/uv.lock b/uv.lock
index c98b69b6..b14c8f3d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -805,7 +805,7 @@ wheels = [
 
 [[package]]
 name = "docling"
-version = "2.39.0"
+version = "2.40.0"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },
@@ -3367,7 +3367,7 @@ name = "nvidia-cudnn-cu12"
 version = "9.5.1.17"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/2a/78/4535c9c7f859a64781e43c969a3a7e84c54634e319a996d43ef32ce46f83/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2", size = 570988386, upload-time = "2024-10-25T19:54:26.39Z" },
@@ -3378,7 +3378,7 @@ name = "nvidia-cufft-cu12"
 version = "11.3.0.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632, upload-time = "2024-11-20T17:41:32.357Z" },
@@ -3407,9 +3407,9 @@ name = "nvidia-cusolver-cu12"
 version = "11.7.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
-    { name = "nvidia-cusparse-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+    { name = "nvidia-cusparse-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790, upload-time = "2024-11-20T17:43:43.211Z" },
@@ -3421,7 +3421,7 @@ name = "nvidia-cusparse-cu12"
 version = "12.5.4.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 216561367, upload-time = "2024-11-20T17:44:54.824Z" },
@@ -3466,10 +3466,10 @@ name = "ocrmac"
 version = "1.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
-    { name = "pillow" },
-    { name = "pyobjc-framework-vision" },
+    { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux')" },
+    { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'darwin'" },
+    { name = "pillow", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
+    { name = "pyobjc-framework-vision", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" }
 wheels = [
@@ -4496,7 +4496,7 @@ name = "pyobjc-framework-cocoa"
 version = "11.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pyobjc-core" },
+    { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/4b/c5/7a866d24bc026f79239b74d05e2cf3088b03263da66d53d1b4cf5207f5ae/pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038", size = 5565335, upload-time = "2025-06-14T20:56:59.683Z" }
 wheels = [
@@ -4515,8 +4515,8 @@ name = "pyobjc-framework-coreml"
 version = "11.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pyobjc-core" },
-    { name = "pyobjc-framework-cocoa" },
+    { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
+    { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0d/5d/4309f220981d769b1a2f0dcb2c5c104490d31389a8ebea67e5595ce1cb74/pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7", size = 82005, upload-time = "2025-06-14T20:57:12.004Z" }
 wheels = [
@@ -4535,8 +4535,8 @@ name = "pyobjc-framework-quartz"
 version = "11.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pyobjc-core" },
-    { name = "pyobjc-framework-cocoa" },
+    { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
+    { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/c7/ac/6308fec6c9ffeda9942fef72724f4094c6df4933560f512e63eac37ebd30/pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75", size = 3953275, upload-time = "2025-06-14T20:58:17.924Z" }
 wheels = [
@@ -4555,10 +4555,10 @@ name = "pyobjc-framework-vision"
 version = "11.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pyobjc-core" },
-    { name = "pyobjc-framework-cocoa" },
-    { name = "pyobjc-framework-coreml" },
-    { name = "pyobjc-framework-quartz" },
+    { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
+    { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
+    { name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
+    { name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/40/a8/7128da4d0a0103cabe58910a7233e2f98d18c590b1d36d4b3efaaedba6b9/pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b", size = 133721, upload-time = "2025-06-14T20:58:46.095Z" }
 wheels = [
@@ -5038,17 +5038,17 @@ version = "1.4.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
     { name = "onnxruntime", version = "1.19.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
-    { name = "opencv-python" },
-    { name = "pillow" },
-    { name = "pyclipper" },
-    { name = "pyyaml" },
+    { name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
+    { name = "opencv-python", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "pillow", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "pyclipper", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "pyyaml", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
     { name = "shapely", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
-    { name = "six" },
-    { name = "tqdm" },
+    { name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
+    { name = "six", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "tqdm", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/ba/12/1e5497183bdbe782dbb91bad1d0d2297dba4d2831b2652657f7517bfc6df/rapidocr_onnxruntime-1.4.4-py3-none-any.whl", hash = "sha256:971d7d5f223a7a808662229df1ef69893809d8457d834e6373d3854bc1782cbf", size = 14915192, upload-time = "2025-01-17T01:48:25.104Z" },
@@ -6344,7 +6344,7 @@ name = "triton"
 version = "3.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "setuptools", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
+    { name = "setuptools", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8d/a9/549e51e9b1b2c9b854fd761a1d23df0ba2fbc60bd0c13b489ffa518cfcb7/triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b74db445b1c562844d3cfad6e9679c72e93fdfb1a90a24052b03bb5c49d1242e", size = 155600257, upload-time = "2025-05-29T23:39:36.085Z" },

From dd8fde7f19ecd9695d6bc6cf94896a2cf87a0e7c Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Mon, 7 Jul 2025 08:59:14 +0200
Subject: [PATCH 3/7] fix: typo in asr options (#1902)

Fix the misspelled `temperatue` keyword argument (now `temperature`) in the six Whisper ASR option presets.

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docling/datamodel/asr_model_specs.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py
index 95287ad2..426b5851 100644
--- a/docling/datamodel/asr_model_specs.py
+++ b/docling/datamodel/asr_model_specs.py
@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )

From edd4356aac25b62c30cae6d2e8c69095f63bd442 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Mon, 7 Jul 2025 16:23:16 +0200
Subject: [PATCH 4/7] fix: use only backend for picture classifier (#1904)

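Make DocumentPictureClassifier a BaseItemAndImageEnrichmentModel so it classifies the image supplied with each element instead of calling get_image() on the item, and keep the backend alive when picture classification is enabled.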

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docling/models/document_picture_classifier.py | 25 +++++++++----------
 docling/pipeline/standard_pdf_pipeline.py     |  1 +
 tests/test_document_picture_classifier.py     |  3 ++-
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py
index 73a30203..24e45078 100644
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -14,7 +14,8 @@ from PIL import Image
 from pydantic import BaseModel
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.models.base_model import BaseEnrichmentModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
     kind: Literal["document_picture_classifier"] = "document_picture_classifier"
 
 
-class DocumentPictureClassifier(BaseEnrichmentModel):
+class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
     """
     A model for classifying pictures in documents.
 
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def __call__(
         self,
         doc: DoclingDocument,
-        element_batch: Iterable[NodeItem],
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
     ) -> Iterable[NodeItem]:
         """
         Processes a batch of elements and enriches them with classification predictions.
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         ----------
         doc : DoclingDocument
             The document containing the elements to be processed.
-        element_batch : Iterable[NodeItem]
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
             A batch of pictures to classify.
 
         Returns
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         """
         if not self.enabled:
             for element in element_batch:
-                yield element
+                yield element.item
             return
 
         images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
-            assert isinstance(el, PictureItem)
-            elements.append(el)
-            img = el.get_image(doc)
-            assert img is not None
-            images.append(img)
+            assert isinstance(el.item, PictureItem)
+            elements.append(el.item)
+            images.append(el.image)
 
         outputs = self.document_picture_classifier.predict(images)
 
-        for element, output in zip(elements, outputs):
-            element.annotations.append(
+        for item, output in zip(elements, outputs):
+            item.annotations.append(
                 PictureClassificationData(
                     provenance="DocumentPictureClassifier",
                     predicted_classes=[
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                 )
             )
 
-            yield element
+            yield item
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 8861174a..de76ef24 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -129,6 +129,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_classification
             or self.pipeline_options.do_picture_description
         ):
             self.keep_backend = True
diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py
index 5dc5e926..3a43a61a 100644
--- a/tests/test_document_picture_classifier.py
+++ b/tests/test_document_picture_classifier.py
@@ -17,8 +17,9 @@ def get_converter():
     pipeline_options.do_table_structure = False
     pipeline_options.do_code_enrichment = False
     pipeline_options.do_formula_enrichment = False
+    pipeline_options.generate_picture_images = False
+    pipeline_options.generate_page_images = False
     pipeline_options.do_picture_classification = True
-    pipeline_options.generate_picture_images = True
     pipeline_options.images_scale = 2
 
     converter = DocumentConverter(

From b8813eea806a33f3bcc4f865d7e6ceba8b2fffa5 Mon Sep 17 00:00:00 2001
From: Shkarupa Alex <shkarupa.alex@gmail.com>
Date: Mon, 7 Jul 2025 17:58:42 +0300
Subject: [PATCH 5/7] feat(vlm): Dynamic prompts (#1808)

* Unify temperature options for Vlm models

* Dynamic prompt support with example

* DCO Remediation Commit for Shkarupa Alex <shkarupa.alex@gmail.com>

I, Shkarupa Alex <shkarupa.alex@gmail.com>, hereby add my Signed-off-by to this commit: 34d446cb9829835cf6b8f8fdb4abd9fef3455c3a
I, Shkarupa Alex <shkarupa.alex@gmail.com>, hereby add my Signed-off-by to this commit: 9c595d574fce5e3e139f5af780f8223496735ff1

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>

* Replace Page with SegmentedPage

* Fix example HF repo link

Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>

* Sign-off

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>

* DCO Remediation Commit for Shkarupa Alex <shkarupa.alex@gmail.com>

I, Shkarupa Alex <shkarupa.alex@gmail.com>, hereby add my Signed-off-by to this commit: 1a162066dd3e4ee240d272d9d503d549a0856590

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>

* Use lmstudio-community model

Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>

* Swap inference engine to LM Studio

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>

---------

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>
Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
---
 .../datamodel/pipeline_options_vlm_model.py   |  7 +-
 docling/models/api_vlm_model.py               | 12 ++--
 .../hf_transformers_model.py                  | 12 ++--
 docling/models/vlm_models_inline/mlx_model.py |  8 ++-
 docling/pipeline/vlm_pipeline.py              |  1 +
 docs/examples/vlm_pipeline_api_model.py       | 71 +++++++++++++++++++
 6 files changed, 96 insertions(+), 15 deletions(-)

diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 90ab6685..fd672b1b 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 
 
 class ResponseFormat(str, Enum):
@@ -51,7 +53,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
 
diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py
index bfd00003..164ac285 100644
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
 
             self.timeout = self.vlm_options.timeout
             self.concurrency = self.vlm_options.concurrency
-            self.prompt_content = (
-                f"This is a page from a document.\n{self.vlm_options.prompt}"
-            )
             self.params = {
                 **self.vlm_options.params,
-                "temperature": 0,
+                "temperature": self.vlm_options.temperature,
             }
 
     def __call__(
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
 
+                    if callable(self.vlm_options.prompt):
+                        prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        prompt = self.vlm_options.prompt
+
                     page_tags = api_image_request(
                         image=hi_res_image,
-                        prompt=self.prompt_content,
+                        prompt=prompt,
                         url=self.vlm_options.url,
                         timeout=self.timeout,
                         headers=self.vlm_options.headers,
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index bd35888d..4e2d80b8 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -128,7 +128,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     )
 
                     # Define prompt structure
-                    prompt = self.formulate_prompt()
+                    if callable(self.vlm_options.prompt):
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        user_prompt = self.vlm_options.prompt
+                    prompt = self.formulate_prompt(user_prompt)
 
                     inputs = self.processor(
                         text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,7 +166,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
 
                 yield page
 
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""
 
         if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
@@ -173,7 +177,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             assistant_prompt = "<|assistant|>"
             prompt_suffix = "<|end|>"
 
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
@@ -187,7 +191,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                         "text": "This is a page from a document.",
                     },
                     {"type": "image"},
-                    {"type": "text", "text": self.vlm_options.prompt},
+                    {"type": "text", "text": user_prompt},
                 ],
             }
         ]
diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
index 58f037fc..647ce531 100644
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
 
-            self.param_question = vlm_options.prompt
-
             ## Load the model
             self.vlm_model, self.processor = load(artifacts_path)
             self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
 
+                    if callable(self.vlm_options.prompt):
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        user_prompt = self.vlm_options.prompt
                     prompt = self.apply_chat_template(
-                        self.processor, self.config, self.param_question, num_images=1
+                        self.processor, self.config, user_prompt, num_images=1
                     )
 
                     start_time = time.time()
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 2ecfe55a..ab474fab 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py
index 679f7bd7..a809b926 100644
--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@@ -1,8 +1,10 @@
 import logging
 import os
 from pathlib import Path
+from typing import Optional
 
 import requests
+from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv
 
 from docling.datamodel.base_models import InputFormat
@@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
     return options
 
 
+#### Using LM Studio with OlmOcr model
+
+
+def lms_olmocr_vlm_options(model: str):
+    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
+        if page is None:
+            return (
+                "Below is the image of one page of a document. Just return the plain text"
+                " representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+            )
+
+        anchor = [
+            f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
+        ]
+
+        for text_cell in page.textline_cells:
+            if not text_cell.text.strip():
+                continue
+            bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
+
+        for image_cell in page.bitmap_resources:
+            bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(
+                f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
+            )
+
+        if len(anchor) == 1:
+            anchor.append(
+                f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
+            )
+
+        # The original prompt sorts the cells. We are skipping that in this demo.
+
+        base_text = "\n".join(anchor)
+
+        return (
+            f"Below is the image of one page of a document, as well as some raw textual"
+            f" content that was previously extracted for it. Just return the plain text"
+            f" representation of this document as if you were reading it naturally.\n"
+            f"Do not hallucinate.\n"
+            f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+        )
+
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+        ),
+        prompt=_dynamic_olmocr_prompt,
+        timeout=90,
+        scale=1.0,
+        max_size=1024,  # from OlmOcr pipeline
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
 #### Using Ollama
 
 
@@ -123,6 +188,12 @@ def main():
     #     format=ResponseFormat.MARKDOWN,
     # )
 
+    # Example using the OlmOcr (dynamic prompt) model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_olmocr_vlm_options(
+    #     model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
+    # )
+
     # Example using the Granite Vision model with Ollama:
     # (uncomment the following lines)
     # pipeline_options.vlm_options = ollama_vlm_options(

From e25873d55766761741ad5781efd18bc3bfea5e3d Mon Sep 17 00:00:00 2001
From: VIktor Kuropiantnyk <103574791+vku-ibm@users.noreply.github.com>
Date: Mon, 7 Jul 2025 17:06:26 +0200
Subject: [PATCH 6/7] fix: docs are missing osd packages for tesseract on RHEL
 (#1905)

Add the missing `tesseract-osd` package to the RHEL tesseract install command in the docs.

Signed-off-by: Viktor Kuropiatnyk <vku@zurich.ibm.com>
---
 docs/installation/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/installation/index.md b/docs/installation/index.md
index 5930525c..38fba4c8 100644
--- a/docs/installation/index.md
+++ b/docs/installation/index.md
@@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
     === "RHEL"
 
         ```console
-        dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+        dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel
         TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
         echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
         ```

From a07ba863c4c3dacfecaca159faa5653097662755 Mon Sep 17 00:00:00 2001
From: geoHeil <1694964+geoHeil@users.noreply.github.com>
Date: Tue, 8 Jul 2025 05:54:57 +0200
Subject: [PATCH 7/7] feat: add image-text-to-text models in transformers
 (#1772)

* feat(dolphin): add dolphin support

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* rename

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* reformat

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* fix mypy

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* add prompt style and examples

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
---
 .../datamodel/pipeline_options_vlm_model.py   |  7 +++
 .../hf_transformers_model.py                  | 49 ++++++++++++-------
 docs/examples/compare_vlm_models.py           | 39 ++++++++++++++-
 3 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index fd672b1b..bcea2493 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -31,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -44,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
 
     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat
 
     torch_dtype: Optional[str] = None
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index 4e2d80b8..d84925dd 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             from transformers import (
                 AutoModel,
                 AutoModelForCausalLM,
+                AutoModelForImageTextToText,
                 AutoModelForVision2Seq,
                 AutoProcessor,
                 BitsAndBytesConfig,
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+            ):
+                model_cls = AutoModelForImageTextToText
 
             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
@@ -169,7 +176,10 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
     def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""
 
-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
 
@@ -182,20 +192,25 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
 
             return prompt
 
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "This is a page from a document.",
-                    },
-                    {"type": "image"},
-                    {"type": "text", "text": user_prompt},
-                ],
-            }
-        ]
-        prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=False
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
         )
-        return prompt
diff --git a/docs/examples/compare_vlm_models.py b/docs/examples/compare_vlm_models.py
index f9bd2dcd..49c34387 100644
--- a/docs/examples/compare_vlm_models.py
+++ b/docs/examples/compare_vlm_models.py
@@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 
 from docling.datamodel import vlm_model_specs
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
@@ -101,6 +108,33 @@ if __name__ == "__main__":
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)
 
+    ## Definition of more inline models
+    llava_qwen = InlineVlmOptions(
+        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
+        # prompt="Read text in the image.",
+        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+        # prompt="Parse the reading order of this document.",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
+    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
+    dolphin_oneshot = InlineVlmOptions(
+        repo_id="ByteDance/Dolphin",
+        prompt="<s>Read text in the image. <Answer/>",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        transformers_prompt_style=TransformersPromptStyle.RAW,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
     pipeline_options.generate_page_images = True
@@ -121,6 +155,9 @@ if __name__ == "__main__":
         vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
         vlm_model_specs.PHI4_TRANSFORMERS,
         vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
+        ## More inline models
+        dolphin_oneshot,
+        llava_qwen,
     ]
 
     # Remove MLX models if not on Mac