Merge branch 'docling-project:main' into fix/docx_text_box_extraction

AndrewTsai0406 2025-06-03 22:00:12 +08:00 committed by GitHub
commit bef2fa95cd
65 changed files with 8631 additions and 9729 deletions


@ -1,19 +0,0 @@
name: 'Set up Poetry and install'
description: 'Set up a specific version of Poetry and install dependencies using caching.'
inputs:
python-version:
description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
default: '3.11'
runs:
using: 'composite'
steps:
- name: Install poetry
run: pipx install poetry==1.8.5
shell: bash
- uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python-version }}
cache: 'poetry'
- name: Install dependencies
run: poetry install --all-extras
shell: bash


@ -10,11 +10,12 @@ fi
CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"
# update package version
poetry version "${TARGET_VERSION}"
uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"
UV_FROZEN=0 uv lock --upgrade-package docling
# collect release notes
REL_NOTES=$(mktemp)
poetry run semantic-release changelog --unreleased >> "${REL_NOTES}"
uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}"
# update changelog
TMP_CHGLOG=$(mktemp)
@ -30,7 +31,7 @@ mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"
# push changes
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
git add pyproject.toml "${CHGLOG_FILE}"
git add pyproject.toml uv.lock "${CHGLOG_FILE}"
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
git commit -m "${COMMIT_MSG}"
git push origin main


@ -4,9 +4,8 @@ on:
workflow_dispatch:
env:
# disable keyring (https://github.com/actions/runner-images/issues/6185):
PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
UV_FROZEN: "1"
jobs:
code-checks:
uses: ./.github/workflows/checks.yml
@ -20,15 +19,20 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # for fetching tags, required for semantic-release
- uses: ./.github/actions/setup-poetry
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
enable-cache: true
- name: Install dependencies
run: uv sync --only-dev
- name: Check version of potential release
id: version_check
run: |
TRGT_VERSION=$(poetry run semantic-release print-version)
echo "TRGT_VERSION=${TRGT_VERSION}" >> $GITHUB_OUTPUT
echo "${TRGT_VERSION}"
TRGT_VERSION=$(uv run --no-sync semantic-release print-version)
echo "TRGT_VERSION=${TRGT_VERSION}" >> "$GITHUB_OUTPUT"
echo "${TRGT_VERSION}"
- name: Check notes of potential release
run: poetry run semantic-release changelog --unreleased
run: uv run --no-sync semantic-release changelog --unreleased
release:
needs: [code-checks, pre-release-check]
if: needs.pre-release-check.outputs.TARGET_TAG_V != ''
@ -45,7 +49,12 @@ jobs:
with:
token: ${{ steps.app-token.outputs.token }}
fetch-depth: 0 # for fetching tags, required for semantic-release
- uses: ./.github/actions/setup-poetry
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
enable-cache: true
- name: Install dependencies
run: uv sync --only-dev
- name: Run release script
env:
GH_TOKEN: ${{ steps.app-token.outputs.token }}


@ -12,6 +12,7 @@ on:
env:
HF_HUB_DOWNLOAD_TIMEOUT: "60"
HF_HUB_ETAG_TIMEOUT: "60"
UV_FROZEN: "1"
jobs:
run-checks:
@ -31,16 +32,24 @@ jobs:
with:
path: ~/.cache/huggingface
key: huggingface-cache-py${{ matrix.python-version }}
- uses: ./.github/actions/setup-poetry
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
- name: Run styling check
run: poetry run pre-commit run --all-files
- name: Install with poetry
run: poetry install --all-extras
enable-cache: true
- name: pre-commit cache key
run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV"
- uses: actions/cache@v4
with:
path: ~/.cache/pre-commit
key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
- name: Install dependencies
run: uv sync --frozen --all-extras
- name: Check style and run tests
run: pre-commit run --all-files
- name: Testing
run: |
poetry run pytest -v --cov=docling --cov-report=xml tests
uv run --no-sync pytest -v --cov=docling --cov-report=xml tests
- name: Upload coverage to Codecov
if: inputs.push_coverage
uses: codecov/codecov-action@v5
@ -51,13 +60,58 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
echo "Skipping $file"
continue
fi
echo "Running example $file"
poetry run python "$file" || exit 1
uv run --no-sync python "$file" || exit 1
done
- name: Build with poetry
run: poetry build
build-package:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.12']
steps:
- uses: actions/checkout@v4
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
- name: Install dependencies
run: uv sync --all-extras
- name: Build package
run: uv build
- name: Check content of wheel
run: unzip -l dist/*.whl
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/
test-package:
needs:
- build-package
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.12']
steps:
- name: Download all the dists
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
- name: Install package
run: uv pip install dist/*.whl
- name: Run docling
run: docling --help


@ -8,6 +8,9 @@ on:
- "**"
- "!gh-pages"
env:
UV_FROZEN: "1"
jobs:
build-docs:
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}


@ -9,10 +9,6 @@ on:
- "!main"
- "!gh-pages"
env:
# disable keyring (https://github.com/actions/runner-images/issues/6185):
PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
jobs:
code-checks:
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}


@ -6,14 +6,21 @@ on:
description: "If true, the docs will be deployed."
default: false
env:
UV_FROZEN: "1"
jobs:
run-docs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/setup-poetry
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
- name: Build docs
run: poetry run mkdocs build --verbose --clean
run: uv run mkdocs build --verbose --clean
- name: Build and push docs
if: inputs.deploy
run: poetry run mkdocs gh-deploy --force
run: uv run --no-sync mkdocs gh-deploy --force


@ -4,16 +4,18 @@ on:
release:
types: [published]
env:
UV_FROZEN: "1"
permissions:
contents: read
env:
# disable keyring (https://github.com/actions/runner-images/issues/6185):
PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
jobs:
build-and-publish:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.12']
environment:
name: pypi
url: https://pypi.org/p/docling
@ -21,9 +23,15 @@ jobs:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/setup-poetry
- name: Build and publish
run: poetry build
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
- name: Install dependencies
run: uv sync --all-extras
- name: Build package
run: uv build
- name: Publish distribution 📦 to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:


@ -17,12 +17,11 @@ repos:
hooks:
- id: mypy
name: MyPy
entry: poetry run mypy docling
entry: uv run --no-sync mypy docling
pass_filenames: false
language: system
files: '\.py$'
- id: poetry
name: Poetry check
entry: poetry check --lock
pass_filenames: false
language: system
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.7.8
hooks:
- id: uv-lock


@ -1,3 +1,25 @@
## [v2.36.0](https://github.com/docling-project/docling/releases/tag/v2.36.0) - 2025-06-03
### Feature
* Simplify dependencies, switch to uv ([#1700](https://github.com/docling-project/docling/issues/1700)) ([`cdd4018`](https://github.com/docling-project/docling/commit/cdd401847a35f16d69944eb6dddf57e4e0b65020))
* New vlm-models support ([#1570](https://github.com/docling-project/docling/issues/1570)) ([`cfdf4ce`](https://github.com/docling-project/docling/commit/cfdf4cea25e681fc557df310b8bf34f3dd892e15))
## [v2.35.0](https://github.com/docling-project/docling/releases/tag/v2.35.0) - 2025-06-02
### Feature
* Add visualization of bbox on page with html export. ([#1663](https://github.com/docling-project/docling/issues/1663)) ([`b356b33`](https://github.com/docling-project/docling/commit/b356b33059bdeeaf1584d9d189cbf1c4832e367c))
### Fix
* Guess HTML content starting with script tag ([#1673](https://github.com/docling-project/docling/issues/1673)) ([`984cb13`](https://github.com/docling-project/docling/commit/984cb137f6a8ae2f3a63623add6c474d97ef8739))
* UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte ([#1665](https://github.com/docling-project/docling/issues/1665)) ([`51d3450`](https://github.com/docling-project/docling/commit/51d34509156e2dbec9e697276681d59f9ca7e020))
### Documentation
* Fix typo in index.md ([#1676](https://github.com/docling-project/docling/issues/1676)) ([`11ca4f7`](https://github.com/docling-project/docling/commit/11ca4f7a7bd8068bee472510dd71f1cd58f86f17))
## [v2.34.0](https://github.com/docling-project/docling/releases/tag/v2.34.0) - 2025-05-22
### Feature


@ -6,70 +6,52 @@ For more details on the contributing guidelines head to the Docling Project [com
## Developing
### Usage of Poetry
### Usage of uv
We use Poetry to manage dependencies.
We use [uv](https://docs.astral.sh/uv/) as package and project manager.
#### Installation
To install Poetry, follow the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
To install `uv`, check the documentation on [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
1. Install Poetry globally on your machine:
```bash
curl -sSL https://install.python-poetry.org | python3 -
```
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
#### Create an environment and sync it
2. Make sure Poetry is in your `$PATH`:
- for `zsh`:
```sh
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
```
- for `bash`:
```sh
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
```
3. The official guidelines linked above include useful details on configuring autocomplete for most shell environments, e.g., Bash and Zsh.
#### Create a Virtual Environment and Install Dependencies
To activate the Virtual Environment, run:
You can use `uv sync` to create the project virtual environment (if it does not already exist) and sync
the project's dependencies with the environment.
```bash
poetry shell
uv sync
```
This will spawn a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
#### Use a specific Python version (optional)
If you need to work with a specific version of Python, you can create a new virtual environment for that version
and run the sync command:
```bash
poetry install
uv venv --python 3.12
uv sync
```
**(Advanced) Use a Specific Python Version**
More detailed options are described on the [Using Python environments](https://docs.astral.sh/uv/pip/environments/) documentation.
If you need to work with a specific (older) version of Python, run:
#### Add a new dependency
Simply use the `uv add` command. The `pyproject.toml` and `uv.lock` files will be updated.
```bash
poetry env use $(which python3.8)
```
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` with the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
#### Add a New Dependency
```bash
poetry add NAME
uv add [OPTIONS] <PACKAGES|--requirements <REQUIREMENTS>>
```
## Coding Style Guidelines
We use the following tools to enforce code style:
- iSort, to sort imports
- Black, to format code
- [Ruff](https://docs.astral.sh/ruff/), as linter and code formatter
- [MyPy](https://mypy.readthedocs.io), as static type checker
We run a series of checks on the codebase on every commit using `pre-commit`. To install the hooks, run:
A set of styling checks, as well as regression tests, is defined and managed through the [pre-commit](https://pre-commit.com/) framework.
To ensure that those scripts run automatically before a commit is finalized, install `pre-commit` on your local repository:
```bash
pre-commit install
@ -81,7 +63,7 @@ To run the checks on-demand, run:
pre-commit run --all-files
```
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
Note: Checks like `Ruff` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
## Tests
@ -94,7 +76,7 @@ When a change improves the conversion results, multiple reference documents must
The reference data can be regenerated with
```sh
DOCLING_GEN_TEST_DATA=1 poetry run pytest
DOCLING_GEN_TEST_DATA=1 uv run pytest
```
All PRs modifying the reference test data require a double review to guarantee we don't miss edge cases.


@ -14,9 +14,8 @@
[![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/)
[![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
[![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
[![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
@ -36,7 +35,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 Extensive OCR support for scanned PDFs and images
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
* 💻 Simple and convenient CLI
### Coming soon


@ -28,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
@ -36,8 +37,6 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrOptions,
PaginatedPipelineOptions,
@ -45,14 +44,16 @@ from docling.datamodel.pipeline_options import (
PdfPipeline,
PdfPipelineOptions,
TableFormerMode,
VlmModelType,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
granite_vision_vlm_ollama_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.settings import settings
from docling.datamodel.vlm_model_specs import (
GRANITE_VISION_OLLAMA,
GRANITE_VISION_TRANSFORMERS,
SMOLDOCLING_MLX,
SMOLDOCLING_TRANSFORMERS,
VlmModelType,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.models.factories import get_ocr_factory
from docling.pipeline.vlm_pipeline import VlmPipeline
@ -579,20 +580,16 @@ def convert( # noqa: C901
)
if vlm_model == VlmModelType.GRANITE_VISION:
pipeline_options.vlm_options = granite_vision_vlm_conversion_options
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
pipeline_options.vlm_options = (
granite_vision_vlm_ollama_conversion_options
)
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
elif vlm_model == VlmModelType.SMOLDOCLING:
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
if sys.platform == "darwin":
try:
import mlx_vlm
pipeline_options.vlm_options = (
smoldocling_vlm_mlx_conversion_options
)
pipeline_options.vlm_options = SMOLDOCLING_MLX
except ImportError:
_log.warning(
"To run SmolDocling faster, please install mlx-vlm:\n"


@ -0,0 +1,68 @@
import logging
import os
import re
from enum import Enum
from typing import Any, Union
from pydantic import field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
_log = logging.getLogger(__name__)
class AcceleratorDevice(str, Enum):
"""Devices to run model inference"""
AUTO = "auto"
CPU = "cpu"
CUDA = "cuda"
MPS = "mps"
class AcceleratorOptions(BaseSettings):
model_config = SettingsConfigDict(
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
)
num_threads: int = 4
device: Union[str, AcceleratorDevice] = "auto"
cuda_use_flash_attention2: bool = False
@field_validator("device")
def validate_device(cls, value):
# "auto", "cpu", "cuda", "mps", or "cuda:N"
if value in {d.value for d in AcceleratorDevice} or re.match(
r"^cuda(:\d+)?$", value
):
return value
raise ValueError(
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
)
@model_validator(mode="before")
@classmethod
def check_alternative_envvars(cls, data: Any) -> Any:
r"""
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
The alternative envvar is used only if it is valid and the regular envvar is not set.
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
the same functionality. In case the alias envvar is set and the user tries to override the
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
as an extra input instead of simply overwriting the envvar value for that parameter.
"""
if isinstance(data, dict):
input_num_threads = data.get("num_threads")
# Check if to set the num_threads from the alternative envvar
if input_num_threads is None:
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
omp_num_threads = os.getenv("OMP_NUM_THREADS")
if docling_num_threads is None and omp_num_threads is not None:
try:
data["num_threads"] = int(omp_num_threads)
except ValueError:
_log.error(
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
omp_num_threads,
)
return data
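
The new module above moves the accelerator settings out of `pipeline_options`. A minimal sketch, not part of the diff, of how the env-var fallback and the device validator behave (assumes docling with this module installed):

```python
import os

from docling.datamodel.accelerator_options import AcceleratorOptions

# OMP_NUM_THREADS is only consulted when num_threads is not passed explicitly and
# DOCLING_NUM_THREADS is unset, exactly as check_alternative_envvars implements it.
os.environ.pop("DOCLING_NUM_THREADS", None)
os.environ["OMP_NUM_THREADS"] = "8"
print(AcceleratorOptions().num_threads)  # -> 8

# The device validator accepts the enum values plus "cuda:N" strings ...
print(AcceleratorOptions(device="cuda:1").device)  # -> cuda:1

# ... and rejects anything else.
try:
    AcceleratorOptions(device="tpu")
except ValueError as err:  # pydantic's ValidationError subclasses ValueError
    print(f"rejected: {err}")
```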


@ -13,11 +13,11 @@ from docling_core.types.doc import (
TableCell,
)
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
# DO NOT REMOVE; explicitly exposed from this location
from docling_core.types.io import (
DocumentStream,
)
# DO NOT REMOVE; explicitly exposed from this location
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, computed_field
@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
error_message: str
# class Cell(BaseModel):
# id: int
# text: str
# bbox: BoundingBox
class Cluster(BaseModel):
id: int
label: DocItemLabel
@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
class VlmPredictionToken(BaseModel):
text: str = ""
token: int = -1
logprob: float = -1
class VlmPrediction(BaseModel):
text: str = ""
generated_tokens: list[VlmPredictionToken] = []
generation_time: float = -1
class ContainerElement(
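
The extended prediction models above let VLM backends attach per-token log-probabilities and generation timing to a page. A small illustrative sketch, with made-up values:

```python
from docling.datamodel.base_models import VlmPrediction, VlmPredictionToken

prediction = VlmPrediction(
    text="<doctag>…</doctag>",
    generated_tokens=[
        VlmPredictionToken(text="<doctag>", token=101, logprob=-0.02),
        VlmPredictionToken(text="…", token=102, logprob=-1.37),
    ],
    generation_time=3.1,
)

# A crude per-page confidence signal from the token log-probabilities.
avg_logprob = sum(t.logprob for t in prediction.generated_tokens) / len(
    prediction.generated_tokens
)
print(f"{len(prediction.generated_tokens)} tokens, avg logprob {avg_logprob:.2f}")
```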


@ -1,6 +1,4 @@
import logging
import os
import re
from enum import Enum
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@ -10,73 +8,28 @@ from pydantic import (
BaseModel,
ConfigDict,
Field,
field_validator,
model_validator,
)
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing_extensions import deprecated
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
)
from docling.datamodel.vlm_model_specs import (
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
VlmModelType,
)
_log = logging.getLogger(__name__)
class AcceleratorDevice(str, Enum):
"""Devices to run model inference"""
AUTO = "auto"
CPU = "cpu"
CUDA = "cuda"
MPS = "mps"
class AcceleratorOptions(BaseSettings):
model_config = SettingsConfigDict(
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
)
num_threads: int = 4
device: Union[str, AcceleratorDevice] = "auto"
cuda_use_flash_attention2: bool = False
@field_validator("device")
def validate_device(cls, value):
# "auto", "cpu", "cuda", "mps", or "cuda:N"
if value in {d.value for d in AcceleratorDevice} or re.match(
r"^cuda(:\d+)?$", value
):
return value
raise ValueError(
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
)
@model_validator(mode="before")
@classmethod
def check_alternative_envvars(cls, data: Any) -> Any:
r"""
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
The alternative envvar is used only if it is valid and the regular envvar is not set.
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
the same functionality. In case the alias envvar is set and the user tries to override the
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
as an extra input instead of simply overwriting the evvar value for that parameter.
"""
if isinstance(data, dict):
input_num_threads = data.get("num_threads")
# Check if to set the num_threads from the alternative envvar
if input_num_threads is None:
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
omp_num_threads = os.getenv("OMP_NUM_THREADS")
if docling_num_threads is None and omp_num_threads is not None:
try:
data["num_threads"] = int(omp_num_threads)
except ValueError:
_log.error(
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
omp_num_threads,
)
return data
class BaseOptions(BaseModel):
"""Base class for options."""
@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
lang: List[str] = [
"english",
"chinese",
] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
# For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
]
# However, language as a parameter is not supported by rapidocr yet
# and hence changing this option doesn't affect anything.
# For more details on supported languages by RapidOCR visit
# https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
# For more details on the following options visit
# https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
# For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
text_score: float = 0.5 # same default as rapidocr
use_det: Optional[bool] = None # same default as rapidocr
use_cls: Optional[bool] = None # same default as rapidocr
use_rec: Optional[bool] = None # same default as rapidocr
# class Device(Enum):
# CPU = "CPU"
# CUDA = "CUDA"
# DIRECTML = "DIRECTML"
# AUTO = "AUTO"
# device: Device = Device.AUTO # Default value is AUTO
print_verbose: bool = False # same default as rapidocr
det_model_path: Optional[str] = None # same default as rapidocr
@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
return self.repo_id.replace("/", "--")
# SmolVLM
smolvlm_picture_description = PictureDescriptionVlmOptions(
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)
# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
# GraniteVision
granite_picture_description = PictureDescriptionVlmOptions(
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
prompt="What is shown in this image?",
)
class BaseVlmOptions(BaseModel):
kind: str
prompt: str
class ResponseFormat(str, Enum):
DOCTAGS = "doctags"
MARKDOWN = "markdown"
class InferenceFramework(str, Enum):
MLX = "mlx"
TRANSFORMERS = "transformers"
OPENAI = "openai"
class HuggingFaceVlmOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options"
repo_id: str
load_in_8bit: bool = True
llm_int8_threshold: float = 6.0
quantized: bool = False
inference_framework: InferenceFramework
response_format: ResponseFormat
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
class ApiVlmOptions(BaseVlmOptions):
kind: Literal["api_model_options"] = "api_model_options"
url: AnyUrl = AnyUrl(
"http://localhost:11434/v1/chat/completions"
) # Default to ollama
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
scale: float = 2.0
timeout: float = 60
concurrency: int = 1
response_format: ResponseFormat
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.MLX,
)
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="ds4sd/SmolDocling-256M-preview",
prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.TRANSFORMERS,
)
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
# prompt="OCR the full page to markdown.",
prompt="OCR this image.",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
)
granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
params={"model": "granite3.2-vision:2b"},
prompt="OCR the full page to markdown.",
scale=1.0,
timeout=120,
response_format=ResponseFormat.MARKDOWN,
)
class VlmModelType(str, Enum):
SMOLDOCLING = "smoldocling"
GRANITE_VISION = "granite_vision"
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
smoldocling_vlm_conversion_options
)

View File

@ -0,0 +1,81 @@
from enum import Enum
from typing import Any, Dict, List, Literal
from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
class BaseVlmOptions(BaseModel):
kind: str
prompt: str
class ResponseFormat(str, Enum):
DOCTAGS = "doctags"
MARKDOWN = "markdown"
HTML = "html"
class InferenceFramework(str, Enum):
MLX = "mlx"
TRANSFORMERS = "transformers"
class TransformersModelType(str, Enum):
AUTOMODEL = "automodel"
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
AUTOMODEL_CAUSALLM = "automodel-causallm"
class InlineVlmOptions(BaseVlmOptions):
kind: Literal["inline_model_options"] = "inline_model_options"
repo_id: str
trust_remote_code: bool = False
load_in_8bit: bool = True
llm_int8_threshold: float = 6.0
quantized: bool = False
inference_framework: InferenceFramework
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
response_format: ResponseFormat
supported_devices: List[AcceleratorDevice] = [
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
]
scale: float = 2.0
temperature: float = 0.0
stop_strings: List[str] = []
extra_generation_config: Dict[str, Any] = {}
use_kv_cache: bool = True
max_new_tokens: int = 4096
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
@deprecated("Use InlineVlmOptions instead.")
class HuggingFaceVlmOptions(InlineVlmOptions):
pass
class ApiVlmOptions(BaseVlmOptions):
kind: Literal["api_model_options"] = "api_model_options"
url: AnyUrl = AnyUrl(
"http://localhost:11434/v1/chat/completions"
) # Default to ollama
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
scale: float = 2.0
timeout: float = 60
concurrency: int = 1
response_format: ResponseFormat
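
These option classes replace the old `HuggingFaceVlmOptions` in `pipeline_options`. A hedged sketch of constructing both variants; the endpoint, model name, and prompts mirror values elsewhere in this diff rather than being prescribed by it:

```python
from pydantic import AnyUrl

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
)

# A remote VLM behind an OpenAI-compatible endpoint (the Ollama-style default URL).
api_opts = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    timeout=120,
)

# A locally executed model driven through transformers.
inline_opts = InlineVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA, AcceleratorDevice.MPS],
)
```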


@ -0,0 +1,144 @@
import logging
from enum import Enum
from pydantic import (
AnyUrl,
)
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TransformersModelType,
)
_log = logging.getLogger(__name__)
# SmolDocling
SMOLDOCLING_MLX = InlineVlmOptions(
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
repo_id="ds4sd/SmolDocling-256M-preview",
prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.TRANSFORMERS,
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
supported_devices=[
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
],
scale=2.0,
temperature=0.0,
)
# GraniteVision
GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
repo_id="ibm-granite/granite-vision-3.2-2b",
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
supported_devices=[
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
],
scale=2.0,
temperature=0.0,
)
GRANITE_VISION_OLLAMA = ApiVlmOptions(
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
params={"model": "granite3.2-vision:2b"},
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
scale=1.0,
timeout=120,
response_format=ResponseFormat.MARKDOWN,
temperature=0.0,
)
# Pixtral
PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
repo_id="mistral-community/pixtral-12b",
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
scale=2.0,
temperature=0.0,
)
PIXTRAL_12B_MLX = InlineVlmOptions(
repo_id="mlx-community/pixtral-12b-bf16",
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
# Phi4
PHI4_TRANSFORMERS = InlineVlmOptions(
repo_id="microsoft/Phi-4-multimodal-instruct",
prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
trust_remote_code=True,
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
scale=2.0,
temperature=0.0,
extra_generation_config=dict(num_logits_to_keep=0),
)
# Qwen
QWEN25_VL_3B_MLX = InlineVlmOptions(
repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
# Gemma-3
GEMMA3_12B_MLX = InlineVlmOptions(
repo_id="mlx-community/gemma-3-12b-it-bf16",
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
GEMMA3_27B_MLX = InlineVlmOptions(
repo_id="mlx-community/gemma-3-27b-it-bf16",
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
class VlmModelType(str, Enum):
SMOLDOCLING = "smoldocling"
GRANITE_VISION = "granite_vision"
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
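
The presets above are meant to be dropped straight into `VlmPipelineOptions`, mirroring the CLI hunk earlier in this diff. A sketch of that wiring, assuming the usual DocumentConverter API; the input file name is hypothetical:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import GRANITE_VISION_TRANSFORMERS
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS  # swap in SMOLDOCLING_MLX on Apple Silicon

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("report.pdf")  # hypothetical input file
print(result.document.export_to_markdown())
```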


@ -186,6 +186,11 @@ class DocumentConverter:
Tuple[Type[BasePipeline], str], BasePipeline
] = {}
def _get_initialized_pipelines(
self,
) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
return self.initialized_pipelines
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
"""Generate a hash of pipeline options to use as part of the cache key."""
options_str = str(pipeline_options.model_dump())


@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ApiVlmOptions
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
from docling.exceptions import OperationNotAllowed
from docling.models.base_model import BasePageModel
from docling.utils.api_image_request import api_image_request


@ -11,9 +11,10 @@ from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BaseModelWithOptions, BasePageModel


@ -16,9 +16,10 @@ from docling_core.types.doc.labels import CodeLanguageLabel
from PIL import Image, ImageOps
from pydantic import BaseModel
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.models.base_model import BaseItemAndImageEnrichmentModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
@ -117,20 +118,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
return download_hf_model(
repo_id="ds4sd/CodeFormula",
force_download=force,
local_dir=local_dir,
revision="v1.0.2",
local_dir=local_dir,
force=force,
progress=progress,
)
return Path(download_path)
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
"""
Determines if a given element in a document can be processed by the model.


@ -13,8 +13,9 @@ from docling_core.types.doc import (
from PIL import Image
from pydantic import BaseModel
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.models.base_model import BaseEnrichmentModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
@ -105,20 +106,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
def download_models(
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
return download_hf_model(
repo_id="ds4sd/DocumentFigureClassifier",
force_download=force,
local_dir=local_dir,
revision="v1.0.1",
local_dir=local_dir,
force=force,
progress=progress,
)
return Path(download_path)
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
"""
Determines if the given element can be processed by the classifier.


@ -9,11 +9,10 @@ import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrOptions,
)


@ -1,182 +0,0 @@
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceVlmModel(BasePageModel):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
vlm_options: HuggingFaceVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
if self.enabled:
import torch
from transformers import ( # type: ignore
AutoModelForVision2Seq,
AutoProcessor,
BitsAndBytesConfig,
)
device = decide_device(accelerator_options.device)
self.device = device
_log.debug(f"Available device for HuggingFace VLM: {device}")
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
# PARAMETERS:
if artifacts_path is None:
artifacts_path = self.download_models(self.vlm_options.repo_id)
elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder
self.param_question = vlm_options.prompt # "Perform Layout Analysis."
self.param_quantization_config = BitsAndBytesConfig(
load_in_8bit=vlm_options.load_in_8bit, # True,
llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0
)
self.param_quantized = vlm_options.quantized # False
self.processor = AutoProcessor.from_pretrained(artifacts_path)
if not self.param_quantized:
self.vlm_model = AutoModelForVision2Seq.from_pretrained(
artifacts_path,
device_map=device,
torch_dtype=torch.bfloat16,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
) # .to(self.device)
else:
self.vlm_model = AutoModelForVision2Seq.from_pretrained(
artifacts_path,
device_map=device,
torch_dtype="auto",
quantization_config=self.param_quantization_config,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
) # .to(self.device)
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
# revision="v0.0.1",
)
return Path(download_path)
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=2.0) # 144dpi
# hi_res_image = page.get_image(scale=1.0) # 72dpi
if hi_res_image is not None:
im_width, im_height = hi_res_image.size
# populate page_tags with predicted doc tags
page_tags = ""
if hi_res_image:
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "This is a page from a document.",
},
{"type": "image"},
{"type": "text", "text": self.param_question},
],
}
]
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=False
)
inputs = self.processor(
text=prompt, images=[hi_res_image], return_tensors="pt"
)
inputs = {k: v.to(self.device) for k, v in inputs.items()}
start_time = time.time()
# Call model to generate:
generated_ids = self.vlm_model.generate(
**inputs, max_new_tokens=4096, use_cache=True
)
generation_time = time.time() - start_time
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=False,
)[0]
num_tokens = len(generated_ids[0])
page_tags = generated_texts
_log.debug(
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
)
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")
# print(f"Page Inference Time: {inference_time:.2f} seconds")
# print(f"Total tokens on page: {num_tokens:.2f}")
# print(f"Tokens/sec: {tokens_per_second:.2f}")
# print("")
page.predictions.vlm_response = VlmPrediction(text=page_tags)
yield page


@ -10,11 +10,12 @@ from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.profiling import TimeRecorder
@ -83,20 +84,14 @@ class LayoutModel(BasePageModel):
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
return download_hf_model(
repo_id="ds4sd/docling-models",
force_download=force,
revision="v2.2.0",
local_dir=local_dir,
revision="v2.1.0",
force=force,
progress=progress,
)
return Path(download_path)
def draw_clusters_and_cells_side_by_side(
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
):


@ -8,10 +8,10 @@ from typing import Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrMacOptions,
OcrOptions,
)


@ -5,8 +5,8 @@ from typing import Optional, Type, Union
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionApiOptions,
PictureDescriptionBaseOptions,
)


@ -13,8 +13,8 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
)
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionBaseOptions,
)
from docling.models.base_model import (


@ -4,16 +4,21 @@ from typing import Optional, Type, Union
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionBaseOptions,
PictureDescriptionVlmOptions,
)
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.accelerator_utils import decide_device
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
class PictureDescriptionVlmModel(
PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
):
@classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
return PictureDescriptionVlmOptions
@ -66,26 +71,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
self.provenance = f"{self.options.repo_id}"
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
)
return Path(download_path)
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
from transformers import GenerationConfig


@ -7,11 +7,10 @@ import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
OcrOptions,
RapidOcrOptions,
)


@ -13,16 +13,16 @@ from docling_core.types.doc.page import (
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
@ -90,20 +90,14 @@ class TableStructureModel(BasePageModel):
def download_models(
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
return download_hf_model(
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.2.0",
local_dir=local_dir,
force=force,
progress=progress,
)
return Path(download_path)
def draw_table_and_cells(
self,
conv_res: ConversionResult,


@ -13,10 +13,10 @@ import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrOptions,
TesseractCliOcrOptions,
)


@ -7,10 +7,10 @@ from typing import Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrOptions,
TesseractOcrOptions,
)


@ -0,0 +1,40 @@
import logging
from pathlib import Path
from typing import Optional
_log = logging.getLogger(__name__)
def download_hf_model(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
revision: Optional[str] = None,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
revision=revision,
)
return Path(download_path)
class HuggingFaceModelDownloadMixin:
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
return download_hf_model(
repo_id=repo_id, local_dir=local_dir, force=force, progress=progress
)
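
Several models now delegate to this helper instead of calling `snapshot_download` themselves. A sketch, with an assumed target directory, of prefetching weights for offline use; `repo_id` and `revision` mirror the layout-model hunk above:

```python
from pathlib import Path

from docling.models.utils.hf_model_download import download_hf_model

# Prefetch the layout/TableFormer weights into a local folder so a later run can
# point artifacts_path at it and work offline.
artifacts = Path("./model-cache")  # assumed location
download_path = download_hf_model(
    repo_id="ds4sd/docling-models",
    revision="v2.2.0",
    local_dir=artifacts / "ds4sd--docling-models",
    progress=True,
)
print(f"weights stored under {download_path}")
```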


@ -0,0 +1,194 @@
import importlib.metadata
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Optional
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import (
InlineVlmOptions,
TransformersModelType,
)
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
vlm_options: InlineVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
if self.enabled:
import torch
from transformers import (
AutoModel,
AutoModelForCausalLM,
AutoModelForVision2Seq,
AutoProcessor,
BitsAndBytesConfig,
GenerationConfig,
)
transformers_version = importlib.metadata.version("transformers")
if (
self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct"
and transformers_version >= "4.52.0"
):
raise NotImplementedError(
f"Phi 4 only works with transformers<4.52.0 but you have {transformers_version=}. Please downgrage running pip install -U 'transformers<4.52.0'."
)
self.device = decide_device(
accelerator_options.device,
supported_devices=vlm_options.supported_devices,
)
_log.debug(f"Available device for VLM: {self.device}")
self.use_cache = vlm_options.use_kv_cache
self.max_new_tokens = vlm_options.max_new_tokens
self.temperature = vlm_options.temperature
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
if artifacts_path is None:
artifacts_path = self.download_models(self.vlm_options.repo_id)
elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder
self.param_quantization_config: Optional[BitsAndBytesConfig] = None
if vlm_options.quantized:
self.param_quantization_config = BitsAndBytesConfig(
load_in_8bit=vlm_options.load_in_8bit,
llm_int8_threshold=vlm_options.llm_int8_threshold,
)
model_cls: Any = AutoModel
if (
self.vlm_options.transformers_model_type
== TransformersModelType.AUTOMODEL_CAUSALLM
):
model_cls = AutoModelForCausalLM
elif (
self.vlm_options.transformers_model_type
== TransformersModelType.AUTOMODEL_VISION2SEQ
):
model_cls = AutoModelForVision2Seq
self.processor = AutoProcessor.from_pretrained(
artifacts_path,
trust_remote_code=vlm_options.trust_remote_code,
)
self.vlm_model = model_cls.from_pretrained(
artifacts_path,
device_map=self.device,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
trust_remote_code=vlm_options.trust_remote_code,
)
# Load generation config
self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=self.vlm_options.scale)
# Define prompt structure
prompt = self.formulate_prompt()
inputs = self.processor(
text=prompt, images=[hi_res_image], return_tensors="pt"
).to(self.device)
start_time = time.time()
# Call model to generate:
generated_ids = self.vlm_model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
use_cache=self.use_cache,
temperature=self.temperature,
generation_config=self.generation_config,
**self.vlm_options.extra_generation_config,
)
generation_time = time.time() - start_time
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=False,
)[0]
num_tokens = len(generated_ids[0])
_log.debug(
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
)
page.predictions.vlm_response = VlmPrediction(
text=generated_texts,
generation_time=generation_time,
)
yield page
def formulate_prompt(self) -> str:
"""Formulate a prompt for the VLM."""
if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
_log.debug("Using specialized prompt for Phi-4")
# more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"
prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
_log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
return prompt
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "This is a page from a document.",
},
{"type": "image"},
{"type": "text", "text": self.vlm_options.prompt},
],
}
]
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=False
)
return prompt

View File

@ -4,29 +4,34 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceMlxModel(BasePageModel):
class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
vlm_options: HuggingFaceVlmOptions,
vlm_options: InlineVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
self.max_tokens = vlm_options.max_new_tokens
self.temperature = vlm_options.temperature
if self.enabled:
try:
@ -39,42 +44,24 @@ class HuggingFaceMlxModel(BasePageModel):
)
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
self.apply_chat_template = apply_chat_template
self.stream_generate = stream_generate
# PARAMETERS:
if artifacts_path is None:
artifacts_path = self.download_models(self.vlm_options.repo_id)
artifacts_path = self.download_models(
self.vlm_options.repo_id,
)
elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder
self.param_question = vlm_options.prompt # "Perform Layout Analysis."
self.param_question = vlm_options.prompt
## Load the model
self.vlm_model, self.processor = load(artifacts_path)
self.config = load_config(artifacts_path)
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
# revision="v0.0.1",
)
return Path(download_path)
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@ -83,12 +70,10 @@ class HuggingFaceMlxModel(BasePageModel):
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "vlm"):
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
assert page.size is not None
hi_res_image = page.get_image(scale=2.0) # 144dpi
# hi_res_image = page.get_image(scale=1.0) # 72dpi
hi_res_image = page.get_image(scale=self.vlm_options.scale)
if hi_res_image is not None:
im_width, im_height = hi_res_image.size
@ -104,16 +89,45 @@ class HuggingFaceMlxModel(BasePageModel):
)
start_time = time.time()
_log.debug("start generating ...")
# Call model to generate:
tokens: list[VlmPredictionToken] = []
output = ""
for token in self.stream_generate(
self.vlm_model,
self.processor,
prompt,
[hi_res_image],
max_tokens=4096,
max_tokens=self.max_tokens,
verbose=False,
temp=self.temperature,
):
if len(token.logprobs.shape) == 1:
tokens.append(
VlmPredictionToken(
text=token.text,
token=token.token,
logprob=token.logprobs[token.token],
)
)
elif (
len(token.logprobs.shape) == 2
and token.logprobs.shape[0] == 1
):
tokens.append(
VlmPredictionToken(
text=token.text,
token=token.token,
logprob=token.logprobs[0, token.token],
)
)
else:
_log.warning(
f"incompatible shape for logprobs: {token.logprobs.shape}"
)
output += token.text
if "</doctag>" in token.text:
break
@ -121,15 +135,13 @@ class HuggingFaceMlxModel(BasePageModel):
generation_time = time.time() - start_time
page_tags = output
_log.debug(f"Generation time {generation_time:.2f} seconds.")
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")
# print(f"Page Inference Time: {inference_time:.2f} seconds")
# print(f"Total tokens on page: {num_tokens:.2f}")
# print(f"Tokens/sec: {tokens_per_second:.2f}")
# print("")
page.predictions.vlm_response = VlmPrediction(text=page_tags)
_log.debug(
f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
)
page.predictions.vlm_response = VlmPrediction(
text=page_tags,
generation_time=generation_time,
generated_tokens=tokens,
)
yield page

View File

@ -1,29 +1,46 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast
from docling_core.types import DoclingDocument
from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
from docling_core.types.doc import (
BoundingBox,
DocItem,
DoclingDocument,
ImageRef,
PictureItem,
ProvenanceItem,
TextItem,
)
from docling_core.types.doc.base import (
BoundingBox,
Size,
)
from docling_core.types.doc.document import DocTagsDocument
from PIL import Image as PILImage
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
ApiVlmOptions,
HuggingFaceVlmOptions,
InferenceFramework,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
)
from docling.datamodel.settings import settings
from docling.models.api_vlm_model import ApiVlmModel
from docling.models.hf_mlx_model import HuggingFaceMlxModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.models.vlm_models_inline.hf_transformers_model import (
HuggingFaceTransformersVlmModel,
)
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -66,8 +83,8 @@ class VlmPipeline(PaginatedPipeline):
vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
),
]
elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
elif isinstance(self.pipeline_options.vlm_options, InlineVlmOptions):
vlm_options = cast(InlineVlmOptions, self.pipeline_options.vlm_options)
if vlm_options.inference_framework == InferenceFramework.MLX:
self.build_pipe = [
HuggingFaceMlxModel(
@ -77,15 +94,19 @@ class VlmPipeline(PaginatedPipeline):
vlm_options=vlm_options,
),
]
else:
elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
self.build_pipe = [
HuggingFaceVlmModel(
HuggingFaceTransformersVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=vlm_options,
),
]
else:
raise ValueError(
f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
)
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
@ -116,49 +137,19 @@ class VlmPipeline(PaginatedPipeline):
self.pipeline_options.vlm_options.response_format
== ResponseFormat.DOCTAGS
):
doctags_list = []
image_list = []
for page in conv_res.pages:
predicted_doctags = ""
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
if page.predictions.vlm_response:
predicted_doctags = page.predictions.vlm_response.text
if page.image:
img = page.image
image_list.append(img)
doctags_list.append(predicted_doctags)
conv_res.document = self._turn_dt_into_doc(conv_res)
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
doctags_list_c, image_list_c
)
conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
# If forced backend text, replace model predicted text with backend one
if self.force_backend_text:
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, TextItem) or len(element.prov) == 0:
continue
page_ix = element.prov[0].page_no - 1
page = conv_res.pages[page_ix]
if not page.size:
continue
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale)
)
txt = self.extract_text_from_backend(page, crop_bbox)
element.text = txt
element.orig = txt
elif (
self.pipeline_options.vlm_options.response_format
== ResponseFormat.MARKDOWN
):
conv_res.document = self._turn_md_into_doc(conv_res)
elif (
self.pipeline_options.vlm_options.response_format == ResponseFormat.HTML
):
conv_res.document = self._turn_html_into_doc(conv_res)
else:
raise RuntimeError(
f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
@ -192,23 +183,199 @@ class VlmPipeline(PaginatedPipeline):
return conv_res
def _turn_md_into_doc(self, conv_res):
predicted_text = ""
for pg_idx, page in enumerate(conv_res.pages):
def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
doctags_list = []
image_list = []
for page in conv_res.pages:
predicted_doctags = ""
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
if page.predictions.vlm_response:
predicted_text += page.predictions.vlm_response.text + "\n\n"
response_bytes = BytesIO(predicted_text.encode("utf8"))
out_doc = InputDocument(
path_or_stream=response_bytes,
filename=conv_res.input.file.name,
format=InputFormat.MD,
backend=MarkdownDocumentBackend,
predicted_doctags = page.predictions.vlm_response.text
if page.image:
img = page.image
image_list.append(img)
doctags_list.append(predicted_doctags)
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
doctags_list_c, image_list_c
)
backend = MarkdownDocumentBackend(
in_doc=out_doc,
path_or_stream=response_bytes,
conv_res.document = DoclingDocument.load_from_doctags(
doctag_document=doctags_doc
)
return backend.convert()
# If forced backend text, replace model predicted text with backend one
if page.size:
if self.force_backend_text:
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, TextItem) or len(element.prov) == 0:
continue
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale)
)
txt = self.extract_text_from_backend(page, crop_bbox)
element.text = txt
element.orig = txt
return conv_res.document
def _turn_md_into_doc(self, conv_res):
def _extract_markdown_code(text):
"""
Extracts text from markdown code blocks (enclosed in triple backticks).
If no code blocks are found, returns the original text.
Args:
text (str): Input text that may contain markdown code blocks
Returns:
str: Extracted code if code blocks exist, otherwise original text
"""
# Regex pattern to match content between triple backticks
# This handles multiline content and optional language specifier
pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
# Search with DOTALL flag to match across multiple lines
mtch = re.search(pattern, text, re.DOTALL)
if mtch:
# Return only the content of the first capturing group
return mtch.group(1)
else:
# No code blocks found, return original text
return text
for pg_idx, page in enumerate(conv_res.pages):
page_no = pg_idx + 1 # FIXME: might be incorrect
predicted_text = ""
if page.predictions.vlm_response:
predicted_text = page.predictions.vlm_response.text + "\n\n"
predicted_text = _extract_markdown_code(text=predicted_text)
response_bytes = BytesIO(predicted_text.encode("utf8"))
out_doc = InputDocument(
path_or_stream=response_bytes,
filename=conv_res.input.file.name,
format=InputFormat.MD,
backend=MarkdownDocumentBackend,
)
backend = MarkdownDocumentBackend(
in_doc=out_doc,
path_or_stream=response_bytes,
)
page_doc = backend.convert()
if page.image is not None:
pg_width = page.image.width
pg_height = page.image.height
else:
pg_width = 1
pg_height = 1
conv_res.document.add_page(
page_no=page_no,
size=Size(width=pg_width, height=pg_height),
image=ImageRef.from_pil(image=page.image, dpi=72)
if page.image
else None,
)
for item, level in page_doc.iterate_items():
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
conv_res.document.append_child_item(child=item)
return conv_res.document
def _turn_html_into_doc(self, conv_res):
def _extract_html_code(text):
"""
Extracts HTML content from markdown code blocks (enclosed in triple backticks).
If no code blocks are found, returns the original text.
Args:
text (str): Input text that may contain markdown code blocks
Returns:
str: Extracted code if code blocks exist, otherwise original text
"""
# Regex pattern to match content between triple backticks
# This handles multiline content and optional language specifier
pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
# Search with DOTALL flag to match across multiple lines
mtch = re.search(pattern, text, re.DOTALL)
if mtch:
# Return only the content of the first capturing group
return mtch.group(1)
else:
# No code blocks found, return original text
return text
for pg_idx, page in enumerate(conv_res.pages):
page_no = pg_idx + 1 # FIXME: might be incorrect
predicted_text = ""
if page.predictions.vlm_response:
predicted_text = page.predictions.vlm_response.text + "\n\n"
predicted_text = _extract_html_code(text=predicted_text)
response_bytes = BytesIO(predicted_text.encode("utf8"))
out_doc = InputDocument(
path_or_stream=response_bytes,
filename=conv_res.input.file.name,
format=InputFormat.MD,
backend=HTMLDocumentBackend,
)
backend = HTMLDocumentBackend(
in_doc=out_doc,
path_or_stream=response_bytes,
)
page_doc = backend.convert()
if page.image is not None:
pg_width = page.image.width
pg_height = page.image.height
else:
pg_width = 1
pg_height = 1
conv_res.document.add_page(
page_no=page_no,
size=Size(width=pg_width, height=pg_height),
image=ImageRef.from_pil(image=page.image, dpi=72)
if page.image
else None,
)
for item, level in page_doc.iterate_items():
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
conv_res.document.append_child_item(child=item)
return conv_res.document
@classmethod
def get_default_options(cls) -> VlmPipelineOptions:

View File

@ -1,13 +1,16 @@
import logging
from typing import List, Optional
import torch
from docling.datamodel.pipeline_options import AcceleratorDevice
from docling.datamodel.accelerator_options import AcceleratorDevice
_log = logging.getLogger(__name__)
def decide_device(accelerator_device: str) -> str:
def decide_device(
accelerator_device: str, supported_devices: Optional[List[AcceleratorDevice]] = None
) -> str:
r"""
Resolve the device based on the acceleration options and the available devices in the system.
@ -20,6 +23,18 @@ def decide_device(accelerator_device: str) -> str:
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
if supported_devices is not None:
if has_cuda and AcceleratorDevice.CUDA not in supported_devices:
_log.info(
f"Removing CUDA from available devices because it is not in {supported_devices=}"
)
has_cuda = False
if has_mps and AcceleratorDevice.MPS not in supported_devices:
_log.info(
f"Removing MPS from available devices because it is not in {supported_devices=}"
)
has_mps = False
if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
if has_cuda:
device = "cuda:0"

View File

@ -4,18 +4,20 @@ from typing import Optional
from docling.datamodel.pipeline_options import (
granite_picture_description,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
smolvlm_picture_description,
)
from docling.datamodel.settings import settings
from docling.datamodel.vlm_model_specs import (
SMOLDOCLING_MLX,
SMOLDOCLING_TRANSFORMERS,
)
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.models.layout_model import LayoutModel
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.utils.hf_model_download import download_hf_model
_log = logging.getLogger(__name__)
@ -75,7 +77,7 @@ def download_models(
if with_smolvlm:
_log.info("Downloading SmolVlm model...")
PictureDescriptionVlmModel.download_models(
download_hf_model(
repo_id=smolvlm_picture_description.repo_id,
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
force=force,
@ -84,26 +86,25 @@ def download_models(
if with_smoldocling:
_log.info("Downloading SmolDocling model...")
HuggingFaceVlmModel.download_models(
repo_id=smoldocling_vlm_conversion_options.repo_id,
local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
download_hf_model(
repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
force=force,
progress=progress,
)
if with_smoldocling_mlx:
_log.info("Downloading SmolDocling MLX model...")
HuggingFaceVlmModel.download_models(
repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
local_dir=output_dir
/ smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
download_hf_model(
repo_id=SMOLDOCLING_MLX.repo_id,
local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
force=force,
progress=progress,
)
if with_granite_vision:
_log.info("Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models(
download_hf_model(
repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder,
force=force,

160
docs/examples/compare_vlm_models.py vendored Normal file
View File

@ -0,0 +1,160 @@
# Compare VLM models
# ==================
#
# This example runs the VLM pipeline with different vision-language models.
# Their runtime as well output quality is compared.
import json
import sys
import time
from pathlib import Path
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from tabulate import tabulate
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
def convert(sources: list[Path], converter: DocumentConverter):
model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
framework = pipeline_options.vlm_options.inference_framework
for source in sources:
print("================================================")
print("Processing...")
print(f"Source: {source}")
print("---")
print(f"Model: {model_id}")
print(f"Framework: {framework}")
print("================================================")
print("")
res = converter.convert(source)
print("")
fname = f"{res.input.file.stem}-{model_id}-{framework}"
inference_time = 0.0
for i, page in enumerate(res.pages):
inference_time += page.predictions.vlm_response.generation_time
print("")
print(
f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format} in {page.predictions.vlm_response.generation_time} [sec]:"
)
print(page.predictions.vlm_response.text)
print(" ---------- ")
print("===== Final output of the converted document =======")
with (out_path / f"{fname}.json").open("w") as fp:
fp.write(json.dumps(res.document.export_to_dict()))
res.document.save_as_json(
out_path / f"{fname}.json",
image_mode=ImageRefMode.PLACEHOLDER,
)
print(f" => produced {out_path / fname}.json")
res.document.save_as_markdown(
out_path / f"{fname}.md",
image_mode=ImageRefMode.PLACEHOLDER,
)
print(f" => produced {out_path / fname}.md")
res.document.save_as_html(
out_path / f"{fname}.html",
image_mode=ImageRefMode.EMBEDDED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
split_page_view=True,
)
print(f" => produced {out_path / fname}.html")
pg_num = res.document.num_pages()
print("")
print(
f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
)
print("====================================================")
return [
source,
model_id,
str(framework),
pg_num,
inference_time,
]
if __name__ == "__main__":
sources = [
"tests/data/pdf/2305.03393v1-pg9.pdf",
]
out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)
## Use VlmPipeline
pipeline_options = VlmPipelineOptions()
pipeline_options.generate_page_images = True
## On GPU systems, enable flash_attention_2 with CUDA:
# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
vlm_models = [
## DocTags / SmolDocling models
vlm_model_specs.SMOLDOCLING_MLX,
vlm_model_specs.SMOLDOCLING_TRANSFORMERS,
## Markdown models (using MLX framework)
vlm_model_specs.QWEN25_VL_3B_MLX,
vlm_model_specs.PIXTRAL_12B_MLX,
vlm_model_specs.GEMMA3_12B_MLX,
## Markdown models (using Transformers framework)
vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
vlm_model_specs.PHI4_TRANSFORMERS,
vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
]
# Remove MLX models if not on Mac
if sys.platform != "darwin":
vlm_models = [
m for m in vlm_models if m.inference_framework != InferenceFramework.MLX
]
rows = []
for vlm_options in vlm_models:
pipeline_options.vlm_options = vlm_options
## Set up pipeline for PDF or image inputs
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
InputFormat.IMAGE: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
},
)
row = convert(sources=sources, converter=converter)
rows.append(row)
print(
tabulate(
rows, headers=["source", "model_id", "framework", "num_pages", "time"]
)
)
print("see if memory gets released ...")
time.sleep(10)

View File

@ -3,10 +3,9 @@ import logging
import time
from pathlib import Path
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -1,101 +1,46 @@
import json
import time
from pathlib import Path
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
smoldocling_vlm_mlx_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
sources = [
# "tests/data/2305.03393v1-pg9-img.png",
"tests/data/pdf/2305.03393v1-pg9.pdf",
]
source = "https://arxiv.org/pdf/2501.17887"
## Use experimental VlmPipeline
pipeline_options = VlmPipelineOptions()
# If force_backend_text = True, text from backend will be used instead of generated text
pipeline_options.force_backend_text = False
###### USING SIMPLE DEFAULT VALUES
# - SmolDocling model
# - Using the transformers framework
## On GPU systems, enable flash_attention_2 with CUDA:
# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
),
}
)
## Pick a VLM model. We choose SmolDocling-256M by default
# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
doc = converter.convert(source=source).document
## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
print(doc.export_to_markdown())
## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
## Set up pipeline for PDF or image inputs
###### USING MACOS MPS ACCELERATOR
# For more options see the compare_vlm_models.py example.
pipeline_options = VlmPipelineOptions(
vlm_options=vlm_model_specs.SMOLDOCLING_MLX,
)
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
InputFormat.IMAGE: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
}
)
out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)
doc = converter.convert(source=source).document
for source in sources:
start_time = time.time()
print("================================================")
print(f"Processing... {source}")
print("================================================")
print("")
res = converter.convert(source)
print("")
print(res.document.export_to_markdown())
for page in res.pages:
print("")
print("Predicted page in DOCTAGS:")
print(page.predictions.vlm_response.text)
res.document.save_as_html(
filename=Path(f"{out_path}/{res.input.file.stem}.html"),
image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
)
with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
fp.write(json.dumps(res.document.export_to_dict()))
res.document.save_as_json(
out_path / f"{res.input.file.stem}.json",
image_mode=ImageRefMode.PLACEHOLDER,
)
res.document.save_as_markdown(
out_path / f"{res.input.file.stem}.md",
image_mode=ImageRefMode.PLACEHOLDER,
)
pg_num = res.document.num_pages()
print("")
inference_time = time.time() - start_time
print(
f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
)
print("================================================")
print("done!")
print("================================================")
print(doc.export_to_markdown())

View File

@ -1,9 +1,8 @@
from pathlib import Path
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
)
from docling.datamodel.settings import settings

View File

@ -1,5 +1,4 @@
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, TableItem, TextItem

View File

@ -7,10 +7,9 @@ from dotenv import load_dotenv
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
ApiVlmOptions,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

17
docs/faq/index.md vendored
View File

@ -44,6 +44,23 @@ This is a collection of FAQ collected from the user questions on <https://github
Source: Issue [#283](https://github.com/docling-project/docling/issues/283#issuecomment-2465035868)
??? question "Is macOS x86_64 supported?"
### Is macOS x86_64 supported?
Yes, Docling (still) supports running the standard pipeline on macOS x86_64.
However, users might run into a combination of incompatible dependencies on a fresh install.
Because Docling depends on PyTorch, which dropped support for macOS x86_64 after the 2.2.2 release,
and that older version of PyTorch works only with NumPy 1.x, users **must** ensure the correct NumPy version is installed.
```shell
pip install docling "numpy<2.0.0"
```
Source: Issue [#1694](https://github.com/docling-project/docling/issues/1694).
??? question "Are text styles (bold, underline, etc) supported?"
### Are text styles (bold, underline, etc) supported?

10
docs/index.md vendored
View File

@ -6,13 +6,13 @@
[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
[![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
[![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
[![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
[![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
[![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
[![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
@ -27,7 +27,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 Extensive OCR support for scanned PDFs and images
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕🔥
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🔥
* 💻 Simple and convenient CLI
### Coming soon
@ -39,7 +39,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Get started
<div class="grid">
<a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamendals</a>
<a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamentals</a>
<a href="examples/" class="card"><b>Examples</b><br />Try out recipes for various use cases, including conversion, RAG, and more</a>
<a href="integrations/" class="card"><b>Integrations</b><br />Check out integrations with popular frameworks and tools</a>
<a href="reference/document_converter/" class="card"><b>Reference</b><br />See more API details</a>

View File

@ -129,5 +129,5 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:
```bash
poetry install --all-extras
uv sync --all-extras
```

121
docs/usage/vision_models.md vendored Normal file
View File

@ -0,0 +1,121 @@
The `VlmPipeline` in Docling allows converting documents end-to-end using a vision-language model.
Docling supports vision-language models which output:
- DocTags (e.g. [SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)), the preferred choice
- Markdown
- HTML
To run Docling with local models using the `VlmPipeline`:
=== "CLI"
```bash
docling --pipeline vlm FILE
```
=== "Python"
See also the example [minimal_vlm_pipeline.py](./../examples/minimal_vlm_pipeline.py).
```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
),
}
)
doc = converter.convert(source="FILE").document
```
## Available local models
By default, the vision-language models run locally.
Docling allows choosing between the Hugging Face [Transformers](https://github.com/huggingface/transformers) framework and [MLX](https://github.com/Blaizzy/mlx-vlm) (for Apple devices with MPS acceleration).
The following table reports the models currently available out-of-the-box.
| Model instance | Model | Framework | Device | Num pages | Inference time (sec) |
| ---------------|------ | --------- | ------ | --------- | ---------------------|
| `vlm_model_specs.SMOLDOCLING_TRANSFORMERS` | [ds4sd/SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview) | `Transformers/AutoModelForVision2Seq` | MPS | 1 | 102.212 |
| `vlm_model_specs.SMOLDOCLING_MLX` | [ds4sd/SmolDocling-256M-preview-mlx-bf16](https://huggingface.co/ds4sd/SmolDocling-256M-preview-mlx-bf16) | `MLX`| MPS | 1 | 6.15453 |
| `vlm_model_specs.QWEN25_VL_3B_MLX` | [mlx-community/Qwen2.5-VL-3B-Instruct-bf16](https://huggingface.co/mlx-community/Qwen2.5-VL-3B-Instruct-bf16) | `MLX`| MPS | 1 | 23.4951 |
| `vlm_model_specs.PIXTRAL_12B_MLX` | [mlx-community/pixtral-12b-bf16](https://huggingface.co/mlx-community/pixtral-12b-bf16) | `MLX` | MPS | 1 | 308.856 |
| `vlm_model_specs.GEMMA3_12B_MLX` | [mlx-community/gemma-3-12b-it-bf16](https://huggingface.co/mlx-community/gemma-3-12b-it-bf16) | `MLX` | MPS | 1 | 378.486 |
| `vlm_model_specs.GRANITE_VISION_TRANSFORMERS` | [ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) | `Transformers/AutoModelForVision2Seq` | MPS | 1 | 104.75 |
| `vlm_model_specs.PHI4_TRANSFORMERS` | [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | `Transformers/AutoModelForCausalLM` | CPU | 1 | 1175.67 |
| `vlm_model_specs.PIXTRAL_12B_TRANSFORMERS` | [mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b) | `Transformers/AutoModelForVision2Seq` | CPU | 1 | 1828.21 |
_Inference time is computed on a MacBook M3 Max using the example page `tests/data/pdf/2305.03393v1-pg9.pdf`. The comparison is done with the example [compare_vlm_models.py](./../examples/compare_vlm_models.py)._
For choosing the model, the code snippet above can be extended as follows:
```python
```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
from docling.datamodel import vlm_model_specs
pipeline_options = VlmPipelineOptions(
vlm_options=vlm_model_specs.SMOLDOCLING_MLX, # <-- change the model here
)
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
}
)
doc = converter.convert(source="FILE").document
```
### Other models
Other models can be configured by directly providing the Hugging Face `repo_id`, the prompt and a few more options.
For example:
```python
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
)
pipeline_options = VlmPipelineOptions(
vlm_options=InlineVlmOptions(
repo_id="ibm-granite/granite-vision-3.2-2b",
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
supported_devices=[
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
],
scale=2.0,
temperature=0.0,
)
)
```
## Remote models
In addition to local models, the `VlmPipeline` allows offloading the inference to a remote service hosting the models.
Many remote inference services can be used; the key requirement is that they offer an OpenAI-compatible API. This includes vLLM, Ollama, and others.
More examples on how to connect with the remote inference services can be found in the following examples:
- [vlm_pipeline_api_model.py](./../examples/vlm_pipeline_api_model.py)
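For reference, here is a minimal configuration sketch for such a remote setup. It assumes a local Ollama server exposing an OpenAI-compatible endpoint at `http://localhost:11434`, an illustrative model name (`granite3.2-vision:2b`), and the `enable_remote_services` flag; the exact field names and defaults should be verified against the example linked above:
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(
    enable_remote_services=True,  # assumed flag allowing calls to a remote endpoint
    vlm_options=ApiVlmOptions(
        url="http://localhost:11434/v1/chat/completions",  # assumed local Ollama endpoint
        params=dict(model="granite3.2-vision:2b"),  # illustrative model name
        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
        timeout=90,
        response_format=ResponseFormat.MARKDOWN,
    ),
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
)

doc = converter.convert(source="FILE").document
```
The same structure applies to any other OpenAI-compatible `/v1/chat/completions` endpoint; typically only the `url`, the `params`, and possibly authentication headers need to change.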

View File

@ -60,6 +60,7 @@ nav:
- Usage: usage/index.md
- Supported formats: usage/supported_formats.md
- Enrichment features: usage/enrichments.md
- Vision models: usage/vision_models.md
- FAQ:
- FAQ: faq/index.md
- Concepts:
@ -78,6 +79,7 @@ nav:
- "Multi-format conversion": examples/run_with_formats.py
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
- "VLM comparison": examples/compare_vlm_models.py
- "Figure export": examples/export_figures.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py

8637
poetry.lock generated

File diff suppressed because it is too large

View File

@ -1,20 +1,8 @@
[tool.poetry]
[project]
name = "docling"
version = "2.34.0" # DO NOT EDIT, updated automatically
version = "2.36.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = [
"Christoph Auer <cau@zurich.ibm.com>",
"Michele Dolfi <dol@zurich.ibm.com>",
"Maxim Lysak <mly@zurich.ibm.com>",
"Nikos Livathinos <nli@zurich.ibm.com>",
"Ahmed Nassar <ahn@zurich.ibm.com>",
"Panos Vagenas <pva@zurich.ibm.com>",
"Peter Staar <taa@zurich.ibm.com>",
]
license = "MIT"
readme = "README.md"
repository = "https://github.com/docling-project/docling"
homepage = "https://github.com/docling-project/docling"
keywords = [
"docling",
"convert",
@ -29,144 +17,137 @@ keywords = [
"table former",
]
classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: MacOS :: MacOS X",
"Operating System :: POSIX :: Linux",
"Operating System :: Microsoft :: Windows",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
packages = [{ include = "docling" }]
[tool.poetry.dependencies]
######################
# actual dependencies:
######################
python = "^3.9"
pydantic = "^2.0.0"
docling-core = {version = "^2.31.2", extras = ["chunking"]}
docling-ibm-models = "^3.4.0"
docling-parse = "^4.0.0"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.2"
easyocr = "^1.7"
tesserocr = { version = "^2.7.1", optional = true }
certifi = ">=2024.7.4"
rtree = "^1.3.0"
scipy = [
{ version = "^1.6.0", markers = "python_version >= '3.10'" },
{ version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
readme = "README.md"
authors = [
{ name = "Christoph Auer", email = "cau@zurich.ibm.com" },
{ name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
{ name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
{ name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
{ name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
{ name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
{ name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
typer = ">=0.12.5,<0.16.0"
python-docx = "^1.1.2"
python-pptx = "^1.0.2"
beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0"
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
onnxruntime = [
# 1.19.2 is the last version with python3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
requires-python = '>=3.9,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.29.0,<3.0.0)',
'docling-ibm-models (>=3.4.4,<4.0.0)',
'docling-parse (>=4.0.0,<5.0.0)',
'filetype (>=1.2.0,<2.0.0)',
'pypdfium2 (>=4.30.0,<5.0.0)',
'pydantic-settings (>=2.3.0,<3.0.0)',
'huggingface_hub (>=0.23,<1)',
'requests (>=2.32.2,<3.0.0)',
'easyocr (>=1.7,<2.0)',
'certifi (>=2024.7.4)',
'rtree (>=1.3.0,<2.0.0)',
'typer (>=0.12.5,<0.16.0)',
'python-docx (>=1.1.2,<2.0.0)',
'python-pptx (>=1.0.2,<2.0.0)',
'beautifulsoup4 (>=4.12.3,<5.0.0)',
'pandas (>=2.1.4,<3.0.0)',
'marko (>=2.1.2,<3.0.0)',
'openpyxl (>=3.1.5,<4.0.0)',
'lxml (>=4.0.0,<6.0.0)',
'pillow (>=10.0.0,<12.0.0)',
'tqdm (>=4.65.0,<5.0.0)',
'pluggy (>=1.0.0,<2.0.0)',
'pylatexenc (>=2.10,<3.0)',
'click (<8.2.0)',
'scipy (>=1.6.0,<2.0.0)',
# 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
# 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
]
transformers = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
]
accelerate = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
]
pillow = ">=10.0.0,<12.0.0"
tqdm = "^4.65.0"
pluggy = "^1.0.0"
pylatexenc = "^2.10"
click = "<8.2.0"
[project.urls]
homepage = "https://github.com/docling-project/docling"
repository = "https://github.com/docling-project/docling"
issues = "https://github.com/docling-project/docling/issues"
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"
[tool.poetry.group.dev.dependencies]
python = "^3.9.2"
black = { extras = ["jupyter"], version = "^24.4.2" }
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
pandas-stubs = "^2.1.4.231227"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
types-openpyxl = "^3.1.5.20241114"
types-tqdm = "^4.67.0.20241221"
coverage = "^7.6.2"
pytest-cov = "^6.0.0"
[project.entry-points.docling]
"docling_defaults" = "docling.models.plugins.defaults"
[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
mkdocs-click = "^0.8.1"
mkdocstrings = { extras = ["python"], version = "^0.27.0" }
griffe-pydantic = "^1.1.0"
[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"
python-dotenv = "^1.0.1"
langchain-huggingface = "^0.0.3"
langchain-milvus = "^0.1.4"
langchain-text-splitters = "^0.2.4"
[tool.poetry.group.constraints]
optional = true
[tool.poetry.group.constraints.dependencies]
numpy = [
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
]
[tool.poetry.group.mac_intel]
optional = true
[tool.poetry.group.mac_intel.dependencies]
torch = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
]
torchvision = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
]
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
vlm = ["transformers", "accelerate"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]
[project.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"
[tool.poetry.plugins."docling"]
"docling_defaults" = "docling.models.plugins.defaults"
[project.optional-dependencies]
tesserocr = ['tesserocr (>=2.7.1,<3.0.0)']
ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
vlm = [
'transformers (>=4.46.0,<5.0.0)',
'accelerate (>=1.2.1,<2.0.0)',
'mlx-vlm >=0.1.22 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
]
rapidocr = [
'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
'onnxruntime (>=1.7.0,<2.0.0)',
# 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
# 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[dependency-groups]
dev = [
"pre-commit~=3.7",
"mypy~=1.10",
"types-setuptools~=70.3",
"pandas-stubs~=2.1",
"types-openpyxl~=3.1",
"types-requests~=2.31",
"boto3-stubs~=1.37",
"types-urllib3~=1.26",
"types-tqdm~=4.67",
"coverage~=7.6",
"pytest~=8.3",
"pytest-cov>=6.1.1",
"pytest-dependency~=0.6",
"pytest-xdist~=3.3",
"ipykernel~=6.29",
"ipywidgets~=8.1",
"nbqa~=1.9",
"python-semantic-release~=7.32",
]
docs = [
"mkdocs-material~=9.5",
"mkdocs-jupyter~=0.25",
"mkdocs-click~=0.8",
"mkdocstrings[python]~=0.27",
"griffe-pydantic~=1.1",
]
examples = [
"datasets~=2.21",
"python-dotenv~=1.0",
"langchain-huggingface>=0.0.3",
"langchain-milvus~=0.1",
"langchain-text-splitters~=0.2",
]
constraints = [
'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]
[tool.uv]
package = true
default-groups = "all"
[tool.setuptools.packages.find]
include = ["docling*"]
[tool.ruff]
target-version = "py39"
@ -182,51 +163,51 @@ skip-magic-trailing-comma = false
[tool.ruff.lint]
select = [
# "B", # flake8-bugbear
"C", # flake8-comprehensions
"C9", # mccabe
# "D", # flake8-docstrings
"E", # pycodestyle errors (default)
"F", # pyflakes (default)
"I", # isort
"PD", # pandas-vet
"PIE", # pie
# "PTH", # pathlib
"Q", # flake8-quotes
# "RET", # return
"RUF", # Enable all ruff-specific checks
# "SIM", # simplify
"S307", # eval
# "T20", # (disallow print statements) keep debugging statements out of the codebase
"W", # pycodestyle warnings
"ASYNC", # async
"UP", # pyupgrade
# "B", # flake8-bugbear
"C", # flake8-comprehensions
"C9", # mccabe
# "D", # flake8-docstrings
"E", # pycodestyle errors (default)
"F", # pyflakes (default)
"I", # isort
"PD", # pandas-vet
"PIE", # pie
# "PTH", # pathlib
"Q", # flake8-quotes
# "RET", # return
"RUF", # Enable all ruff-specific checks
# "SIM", # simplify
"S307", # eval
# "T20", # (disallow print statements) keep debugging statements out of the codebase
"W", # pycodestyle warnings
"ASYNC", # async
"UP", # pyupgrade
]
ignore = [
"C408", # Unnecessary `dict()` call (rewrite as a literal)
"E501", # Line too long, handled by ruff formatter
"D107", # "Missing docstring in __init__",
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
"F811", # "redefinition of the same function"
"PL", # Pylint
"RUF012", # Mutable Class Attributes
"UP006", # List vs list, etc
"UP007", # Option and Union
"UP035", # `typing.Set` is deprecated, use `set` instead"
"C408", # Unnecessary `dict()` call (rewrite as a literal)
"E501", # Line too long, handled by ruff formatter
"D107", # "Missing docstring in __init__",
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
"F811", # "redefinition of the same function"
"PL", # Pylint
"RUF012", # Mutable Class Attributes
"UP006", # List vs list, etc
"UP007", # Option and Union
"UP035", # `typing.Set` is deprecated, use `set` instead"
]
#extend-select = []
[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
# Allow Pydantic's `@validator` decorator to trigger class method treatment.
"pydantic.validator",
# Allow Pydantic's `@validator` decorator to trigger class method treatment.
"pydantic.validator",
]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
[tool.ruff.lint.mccabe]
max-complexity = 20

Binary file not shown.

View File

@ -26,69 +26,71 @@ item-0 at level 0: unspecified: group _root_
item-21 at level 1: paragraph:
item-22 at level 1: paragraph:
item-23 at level 1: section: group textbox
item-24 at level 2: paragraph:  A report must be submitted wi ... saster Prevention Information Network.
item-25 at level 2: paragraph:  A report must also be submitt ... d Infectious Disease Reporting System.
item-26 at level 2: paragraph:
item-24 at level 2: list: group list
item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-27 at level 2: paragraph:
item-28 at level 1: paragraph:
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-28 at level 2: paragraph:
item-29 at level 1: list: group list
item-30 at level 2: list_item:
item-31 at level 1: paragraph:
item-32 at level 1: paragraph:
item-33 at level 1: paragraph:
item-34 at level 1: section: group textbox
item-35 at level 2: paragraph: Health Bureau:
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-37 at level 2: list: group list
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-40 at level 2: paragraph:
item-41 at level 2: paragraph:
item-42 at level 1: list: group list
item-43 at level 2: list_item:
item-44 at level 1: paragraph:
item-45 at level 1: section: group textbox
item-46 at level 2: paragraph: Department of Education:
item-34 at level 1: paragraph:
item-35 at level 1: paragraph:
item-36 at level 1: section: group textbox
item-37 at level 2: paragraph: Health Bureau:
item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-39 at level 2: list: group list
item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-41 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-42 at level 2: paragraph:
item-43 at level 2: paragraph:
item-44 at level 1: list: group list
item-45 at level 2: list_item:
item-46 at level 1: paragraph:
item-47 at level 1: section: group textbox
item-48 at level 2: paragraph: Department of Education:
Collabo ... vention measures at all school levels.
item-47 at level 1: paragraph:
item-48 at level 1: paragraph:
item-49 at level 1: paragraph:
item-50 at level 1: paragraph:
item-51 at level 1: paragraph:
item-52 at level 1: paragraph:
item-53 at level 1: paragraph:
item-54 at level 1: section: group textbox
item-55 at level 2: inline: group group
item-56 at level 3: paragraph: The Health Bureau will handle
item-57 at level 3: paragraph: reporting and specimen collection
item-58 at level 3: paragraph: .
item-59 at level 2: paragraph:
item-60 at level 2: paragraph:
item-61 at level 1: paragraph:
item-62 at level 1: paragraph:
item-54 at level 1: paragraph:
item-55 at level 1: paragraph:
item-56 at level 1: section: group textbox
item-57 at level 2: inline: group group
item-58 at level 3: paragraph: The Health Bureau will handle
item-59 at level 3: paragraph: reporting and specimen collection
item-60 at level 3: paragraph: .
item-61 at level 2: paragraph:
item-62 at level 2: paragraph:
item-63 at level 1: paragraph:
item-64 at level 1: section: group textbox
item-65 at level 2: paragraph: Whether the epidemic has eased.
item-66 at level 2: paragraph:
item-67 at level 2: paragraph:
item-68 at level 1: paragraph:
item-69 at level 1: section: group textbox
item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-71 at level 2: paragraph: No
item-72 at level 1: paragraph:
item-73 at level 1: paragraph:
item-74 at level 1: section: group textbox
item-64 at level 1: paragraph:
item-65 at level 1: paragraph:
item-66 at level 1: section: group textbox
item-67 at level 2: paragraph: Whether the epidemic has eased.
item-68 at level 2: paragraph:
item-69 at level 2: paragraph:
item-70 at level 1: paragraph:
item-71 at level 1: section: group textbox
item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-73 at level 2: paragraph: No
item-74 at level 1: paragraph:
item-75 at level 1: paragraph:
item-76 at level 1: section: group textbox
item-77 at level 1: paragraph:
item-78 at level 1: paragraph:
item-79 at level 1: section: group textbox
item-80 at level 2: paragraph: Case closed.
item-81 at level 2: paragraph:
item-82 at level 2: paragraph:
item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-84 at level 1: paragraph:
item-85 at level 1: section: group textbox
item-78 at level 1: section: group textbox
item-79 at level 1: paragraph:
item-80 at level 1: paragraph:
item-81 at level 1: section: group textbox
item-82 at level 2: paragraph: Case closed.
item-83 at level 2: paragraph:
item-84 at level 2: paragraph:
item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-86 at level 1: paragraph:
item-87 at level 1: paragraph:
item-88 at level 1: paragraph:
item-87 at level 1: section: group textbox
item-88 at level 1: paragraph:
item-89 at level 1: paragraph:
item-90 at level 1: paragraph:

View File

@ -4,7 +4,7 @@
"name": "textbox",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 830302052279341882,
"binary_hash": 11723995438039370060,
"filename": "textbox.docx"
},
"furniture": {
@ -66,7 +66,7 @@
"$ref": "#/groups/4"
},
{
"$ref": "#/texts/22"
"$ref": "#/groups/6"
},
{
"$ref": "#/texts/23"
@ -84,16 +84,16 @@
"$ref": "#/texts/27"
},
{
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
{
"$ref": "#/groups/7"
"$ref": "#/groups/9"
},
{
"$ref": "#/texts/35"
},
{
"$ref": "#/groups/8"
"$ref": "#/groups/10"
},
{
"$ref": "#/texts/37"
@ -117,7 +117,7 @@
"$ref": "#/texts/43"
},
{
"$ref": "#/groups/9"
"$ref": "#/groups/11"
},
{
"$ref": "#/texts/49"
@ -129,13 +129,13 @@
"$ref": "#/texts/51"
},
{
"$ref": "#/groups/11"
"$ref": "#/groups/13"
},
{
"$ref": "#/texts/55"
},
{
"$ref": "#/groups/12"
"$ref": "#/groups/14"
},
{
"$ref": "#/texts/58"
@ -144,13 +144,13 @@
"$ref": "#/texts/59"
},
{
"$ref": "#/groups/13"
"$ref": "#/groups/15"
},
{
"$ref": "#/texts/60"
},
{
"$ref": "#/groups/14"
"$ref": "#/groups/16"
},
{
"$ref": "#/texts/61"
@ -159,13 +159,13 @@
"$ref": "#/texts/62"
},
{
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
{
"$ref": "#/texts/67"
},
{
"$ref": "#/groups/16"
"$ref": "#/groups/18"
},
{
"$ref": "#/texts/68"
@ -254,10 +254,7 @@
},
"children": [
{
"$ref": "#/texts/18"
},
{
"$ref": "#/texts/19"
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/20"
@ -272,6 +269,37 @@
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/texts/18"
},
{
"$ref": "#/texts/19"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/22"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/body"
},
@ -283,7 +311,7 @@
"$ref": "#/texts/29"
},
{
"$ref": "#/groups/6"
"$ref": "#/groups/8"
},
{
"$ref": "#/texts/32"
@ -297,9 +325,9 @@
"label": "section"
},
{
"self_ref": "#/groups/6",
"self_ref": "#/groups/8",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [
{
@ -314,7 +342,7 @@
"label": "list"
},
{
"self_ref": "#/groups/7",
"self_ref": "#/groups/9",
"parent": {
"$ref": "#/body"
},
@ -328,7 +356,7 @@
"label": "list"
},
{
"self_ref": "#/groups/8",
"self_ref": "#/groups/10",
"parent": {
"$ref": "#/body"
},
@ -342,13 +370,13 @@
"label": "section"
},
{
"self_ref": "#/groups/9",
"self_ref": "#/groups/11",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/10"
"$ref": "#/groups/12"
},
{
"$ref": "#/texts/47"
@ -362,9 +390,9 @@
"label": "section"
},
{
"self_ref": "#/groups/10",
"self_ref": "#/groups/12",
"parent": {
"$ref": "#/groups/9"
"$ref": "#/groups/11"
},
"children": [
{
@ -382,7 +410,7 @@
"label": "inline"
},
{
"self_ref": "#/groups/11",
"self_ref": "#/groups/13",
"parent": {
"$ref": "#/body"
},
@ -402,7 +430,7 @@
"label": "section"
},
{
"self_ref": "#/groups/12",
"self_ref": "#/groups/14",
"parent": {
"$ref": "#/body"
},
@ -418,31 +446,31 @@
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/13",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/14",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/15",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/16",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/17",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/63"
@ -462,7 +490,7 @@
"label": "section"
},
{
"self_ref": "#/groups/16",
"self_ref": "#/groups/18",
"parent": {
"$ref": "#/body"
},
@ -732,38 +760,42 @@
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/4"
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "list_item",
"prov": [],
"orig": "A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.",
"text": "A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.",
"orig": "A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.",
"text": "A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/4"
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "list_item",
"prov": [],
"orig": "A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.",
"text": "A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.",
"orig": "A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.",
"text": "A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
@ -792,14 +824,16 @@
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "list_item",
"prov": [],
"orig": "",
"text": ""
"text": "",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/23",
@ -864,7 +898,7 @@
{
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
@ -882,7 +916,7 @@
{
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
@ -900,7 +934,7 @@
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/6"
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
@ -920,7 +954,7 @@
{
"self_ref": "#/texts/31",
"parent": {
"$ref": "#/groups/6"
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
@ -940,7 +974,7 @@
{
"self_ref": "#/texts/32",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
@ -952,7 +986,7 @@
{
"self_ref": "#/texts/33",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
@ -964,7 +998,7 @@
{
"self_ref": "#/texts/34",
"parent": {
"$ref": "#/groups/7"
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
@ -990,7 +1024,7 @@
{
"self_ref": "#/texts/36",
"parent": {
"$ref": "#/groups/8"
"$ref": "#/groups/10"
},
"children": [],
"content_layer": "body",
@ -1092,7 +1126,7 @@
{
"self_ref": "#/texts/44",
"parent": {
"$ref": "#/groups/10"
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
@ -1110,7 +1144,7 @@
{
"self_ref": "#/texts/45",
"parent": {
"$ref": "#/groups/10"
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
@ -1128,7 +1162,7 @@
{
"self_ref": "#/texts/46",
"parent": {
"$ref": "#/groups/10"
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
@ -1146,7 +1180,7 @@
{
"self_ref": "#/texts/47",
"parent": {
"$ref": "#/groups/9"
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
@ -1158,7 +1192,7 @@
{
"self_ref": "#/texts/48",
"parent": {
"$ref": "#/groups/9"
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
@ -1206,7 +1240,7 @@
{
"self_ref": "#/texts/52",
"parent": {
"$ref": "#/groups/11"
"$ref": "#/groups/13"
},
"children": [],
"content_layer": "body",
@ -1224,7 +1258,7 @@
{
"self_ref": "#/texts/53",
"parent": {
"$ref": "#/groups/11"
"$ref": "#/groups/13"
},
"children": [],
"content_layer": "body",
@ -1236,7 +1270,7 @@
{
"self_ref": "#/texts/54",
"parent": {
"$ref": "#/groups/11"
"$ref": "#/groups/13"
},
"children": [],
"content_layer": "body",
@ -1260,7 +1294,7 @@
{
"self_ref": "#/texts/56",
"parent": {
"$ref": "#/groups/12"
"$ref": "#/groups/14"
},
"children": [],
"content_layer": "body",
@ -1278,7 +1312,7 @@
{
"self_ref": "#/texts/57",
"parent": {
"$ref": "#/groups/12"
"$ref": "#/groups/14"
},
"children": [],
"content_layer": "body",
@ -1356,7 +1390,7 @@
{
"self_ref": "#/texts/63",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
"children": [],
"content_layer": "body",
@ -1374,7 +1408,7 @@
{
"self_ref": "#/texts/64",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
"children": [],
"content_layer": "body",
@ -1386,7 +1420,7 @@
{
"self_ref": "#/texts/65",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
"children": [],
"content_layer": "body",
@ -1398,7 +1432,7 @@
{
"self_ref": "#/texts/66",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
"children": [],
"content_layer": "body",

View File

@ -19,9 +19,8 @@ show the same suggested reportable symptoms
Yes
A report must be submitted within 24 hours via the Ministry of Education's Campus Safety and Disaster Prevention Information Network.
A report must also be submitted within 48 hours through Chiayi County's School Suspected Infectious Disease Reporting System.
- A report must be submitted within 24 hours via the Ministry of Education's Campus Safety and Disaster Prevention Information Network.
- A report must also be submitted within 48 hours through Chiayi County's School Suspected Infectious Disease Reporting System.
**Health Bureau:**

View File

@ -1,5 +1,7 @@
from pathlib import Path
import pytest
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
@ -16,6 +18,7 @@ from .verify_utils import verify_document, verify_export
GENERATE = GEN_TEST_DATA
@pytest.mark.xfail(strict=False)
def test_textbox_extraction():
in_path = Path("tests/data/docx/textbox.docx")
in_doc = InputDocument(
@ -77,8 +80,7 @@ def get_converter():
return converter
def test_e2e_docx_conversions():
docx_paths = get_docx_paths()
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
converter = get_converter()
for docx_path in docx_paths:
@ -115,3 +117,17 @@ def test_e2e_docx_conversions():
gtfile=str(gt_path) + ".html",
generate=GENERATE,
), "export to html"
flaky_path = Path("tests/data/docx/textbox.docx")
def test_e2e_docx_conversions():
_test_e2e_docx_conversions_impl(
docx_paths=[path for path in get_docx_paths() if path != flaky_path]
)
@pytest.mark.xfail(strict=False)
def test_textbox_conversion():
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
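For readers skimming the hunk above: the change splits the end-to-end DOCX test so that the flaky textbox fixture runs under xfail(strict=False) while every other fixture keeps failing hard. A minimal, self-contained sketch of that pattern follows; get_docx_paths and convert_one here are simplified stand-ins for the real helpers (_test_e2e_docx_conversions_impl and the converter/verification calls in the diff), not code from this PR.

from pathlib import Path

import pytest

FLAKY_PATH = Path("tests/data/docx/textbox.docx")


def get_docx_paths() -> list[Path]:
    # Simplified stand-in: the real helper collects the DOCX fixtures for the suite.
    return sorted(Path("tests/data/docx").glob("*.docx"))


def convert_one(docx_path: Path) -> None:
    # Simplified stand-in for the conversion + ground-truth verification logic.
    assert docx_path.exists()


def test_e2e_docx_conversions() -> None:
    # Stable fixtures must always pass, so the flaky one is filtered out here.
    for path in get_docx_paths():
        if path != FLAKY_PATH:
            convert_one(path)


@pytest.mark.xfail(strict=False)
def test_textbox_conversion() -> None:
    # strict=False records a failure as xfail and an occasional pass as xpass;
    # neither outcome breaks CI while textbox extraction is being stabilized.
    convert_one(FLAKY_PATH)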

View File

@ -1,9 +1,10 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .test_data_gen_flag import GEN_TEST_DATA

View File

@ -3,10 +3,10 @@ from pathlib import Path
from typing import List, Tuple
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
EasyOcrOptions,
OcrMacOptions,
OcrOptions,

View File

@ -7,11 +7,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TableFormerMode,
)
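The import hunks above all make the same mechanical move: AcceleratorDevice and AcceleratorOptions now live in docling.datamodel.accelerator_options instead of docling.datamodel.pipeline_options. A short sketch of a caller written against the new layout follows; it is based on docling's public converter API rather than on code in this PR, and the concrete values (CPU device, 4 threads) are illustrative assumptions.

from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Accelerator settings are configured through the relocated options module.
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=4, device=AcceleratorDevice.CPU
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)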

View File

@ -323,33 +323,33 @@ def verify_conversion_result_v1(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
with open(pages_path, mode="w", encoding="utf-8") as fw:
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
with open(json_path, mode="w", encoding="utf-8") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
with open(md_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_md)
dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw:
with open(dt_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_dt)
else: # default branch in test
with open(pages_path) as fr:
with open(pages_path, encoding="utf-8") as fr:
doc_true_pages = PageList.validate_json(fr.read())
with open(json_path) as fr:
with open(json_path, encoding="utf-8") as fr:
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
with open(md_path) as fr:
with open(md_path, encoding="utf-8") as fr:
doc_true_md = fr.read()
with open(dt_path) as fr:
with open(dt_path, encoding="utf-8") as fr:
doc_true_dt = fr.read()
if not fuzzy:
@ -408,33 +408,33 @@ def verify_conversion_result_v2(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
with open(pages_path, mode="w", encoding="utf-8") as fw:
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
with open(json_path, mode="w", encoding="utf-8") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
with open(md_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_md)
dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw:
with open(dt_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_dt)
else: # default branch in test
with open(pages_path) as fr:
with open(pages_path, encoding="utf-8") as fr:
doc_true_pages = PageList.validate_json(fr.read())
with open(json_path) as fr:
with open(json_path, encoding="utf-8") as fr:
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
with open(md_path) as fr:
with open(md_path, encoding="utf-8") as fr:
doc_true_md = fr.read()
with open(dt_path) as fr:
with open(dt_path, encoding="utf-8") as fr:
doc_true_dt = fr.read()
if not fuzzy:
@ -461,12 +461,12 @@ def verify_conversion_result_v2(
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
if not os.path.exists(gtfile) or generate:
with open(gtfile, "w") as fw:
with open(gtfile, mode="w", encoding="utf-8") as fw:
json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
return True
else:
with open(gtfile) as fr:
with open(gtfile, encoding="utf-8") as fr:
true_doc = DoclingDocument.model_validate_json(fr.read())
return verify_docitems(pred_doc, true_doc, fuzzy=False)
@ -476,11 +476,11 @@ def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool:
file = Path(gtfile)
if not file.exists() or generate:
with file.open("w") as fw:
with file.open(mode="w", encoding="utf-8") as fw:
fw.write(pred_text)
return True
with file.open("r") as fr:
with file.open(encoding="utf-8") as fr:
true_text = fr.read()
return pred_text == true_text
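The verify_utils changes above are likewise mechanical: every ground-truth read and write now passes an explicit encoding, so comparisons no longer depend on the platform's default locale (on Windows that default is often cp1252, which would corrupt the non-ASCII fixtures). A small sketch of the same pattern, using hypothetical helper names:

import json
from pathlib import Path


def write_gt(path: Path, payload: dict, indent: int = 2) -> None:
    # Explicit UTF-8 keeps ground-truth files byte-identical across platforms.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, mode="w", encoding="utf-8") as fw:
        fw.write(json.dumps(payload, ensure_ascii=False, indent=indent))


def read_gt(path: Path) -> dict:
    # Reads must use the same encoding the writer used.
    with open(path, encoding="utf-8") as fr:
        return json.loads(fr.read())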

6830 uv.lock generated Normal file

File diff suppressed because it is too large