Merge branch 'main' of https://github.com/docling-project/docling into dev/fix_msword_backend_identify_text_after_image

Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
2025-07-27 04:24:45 +00:00 · 2025-06-05 22:20:09 +02:00 · 2025-06-05 22:20:09 +02:00 · 2bc564ccef
commit 2bc564ccef
parent 93d98dfa63 a2b83fe4ae
67 changed files with 8728 additions and 9731 deletions
--- a/.github/actions/setup-poetry/action.yml
+++ b/.github/actions/setup-poetry/action.yml
@ -1,19 +0,0 @@
 name: 'Set up Poetry and install'
 description: 'Set up a specific version of Poetry and install dependencies using caching.'
 inputs:
  python-version:
    description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
    default: '3.11'
 runs:
  using: 'composite'
  steps:
    - name: Install poetry
      run: pipx install poetry==1.8.5
      shell: bash
    - uses: actions/setup-python@v5
      with:
        python-version: ${{ inputs.python-version }}
        cache: 'poetry'
    - name: Install dependencies
      run: poetry install --all-extras
      shell: bash
--- a/.github/scripts/release.sh
+++ b/.github/scripts/release.sh
@ -10,11 +10,12 @@ fi
 CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"
 # update package version
-poetry version "${TARGET_VERSION}"
+uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"
 UV_FROZEN=0 uv lock --upgrade-package docling
 # collect release notes
 REL_NOTES=$(mktemp)
-poetry run semantic-release changelog --unreleased >> "${REL_NOTES}"
+uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}"
 # update changelog
 TMP_CHGLOG=$(mktemp)
@ -30,7 +31,7 @@ mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"
 # push changes
 git config --global user.name 'github-actions[bot]'
 git config --global user.email 'github-actions[bot]@users.noreply.github.com'
-git add pyproject.toml "${CHGLOG_FILE}"
+git add pyproject.toml uv.lock "${CHGLOG_FILE}"
 COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
 git commit -m "${COMMIT_MSG}"
 git push origin main
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@ -4,8 +4,7 @@ on:
  workflow_dispatch:
 env:
-  # disable keyring (https://github.com/actions/runner-images/issues/6185):
+  UV_FROZEN: "1"
  PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
 jobs:
  code-checks:
@ -20,15 +19,20 @@ jobs:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # for fetching tags, required for semantic-release
-      - uses: ./.github/actions/setup-poetry
+      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true
      - name: Install dependencies
        run: uv sync --only-dev
      - name: Check version of potential release
        id: version_check
        run: |
-            TRGT_VERSION=$(poetry run semantic-release print-version)
+          TRGT_VERSION=$(uv run --no-sync semantic-release print-version)
-            echo "TRGT_VERSION=${TRGT_VERSION}" >> $GITHUB_OUTPUT
+          echo "TRGT_VERSION=${TRGT_VERSION}" >> "$GITHUB_OUTPUT"
          echo "${TRGT_VERSION}"
      - name: Check notes of potential release
-        run: poetry run semantic-release changelog --unreleased
+        run: uv run --no-sync semantic-release changelog --unreleased
  release:
    needs: [code-checks, pre-release-check]
    if: needs.pre-release-check.outputs.TARGET_TAG_V != ''
@ -45,7 +49,12 @@ jobs:
        with:
          token: ${{ steps.app-token.outputs.token }}
          fetch-depth: 0  # for fetching tags, required for semantic-release
-      - uses: ./.github/actions/setup-poetry
+      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true
      - name: Install dependencies
        run: uv sync --only-dev
      - name: Run release script
        env:
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@ -12,6 +12,7 @@ on:
 env:
  HF_HUB_DOWNLOAD_TIMEOUT: "60"
  HF_HUB_ETAG_TIMEOUT: "60"
  UV_FROZEN: "1"
 jobs:
  run-checks:
@ -31,16 +32,24 @@ jobs:
        with:
          path: ~/.cache/huggingface
          key: huggingface-cache-py${{ matrix.python-version }}
-      - uses: ./.github/actions/setup-poetry
+      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v5
        with:
          python-version: ${{ matrix.python-version }}
-      - name: Run styling check
+          enable-cache: true
-        run: poetry run pre-commit run --all-files
+      - name: pre-commit cache key
-      - name: Install with poetry
+        run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV"
-        run: poetry install --all-extras
+      - uses: actions/cache@v4
        with:
          path: ~/.cache/pre-commit
          key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
      - name: Install dependencies
        run: uv sync --frozen --all-extras
      - name: Check style and run tests
        run: pre-commit run --all-files
      - name: Testing
        run: |
-          poetry run pytest -v --cov=docling --cov-report=xml tests
+          uv run --no-sync pytest -v --cov=docling --cov-report=xml tests
      - name: Upload coverage to Codecov
        if: inputs.push_coverage
        uses: codecov/codecov-action@v5
@ -51,13 +60,58 @@ jobs:
        run: |
          for file in docs/examples/*.py; do
            # Skip the examples listed in the pattern below (not only batch_convert.py)
-            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
+            if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
                echo "Skipping $file"
                continue
            fi
            echo "Running example $file"
-            poetry run python "$file" || exit 1
+            uv run --no-sync python "$file" || exit 1
          done
-      - name: Build with poetry
+
-        run: poetry build
+  build-package:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.12']
    steps:
      - uses: actions/checkout@v4
      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v5
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
      - name: Install dependencies
        run: uv sync --all-extras
      - name: Build package
        run: uv build
      - name: Check content of wheel
        run: unzip -l dist/*.whl
      - name: Store the distribution packages
        uses: actions/upload-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
  test-package:
    needs:
      - build-package
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.12']
    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v5
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
      - name: Install package
        run: uv pip install dist/*.whl
      - name: Run docling
        run: docling --help
--- a/.github/workflows/ci-docs.yml
+++ b/.github/workflows/ci-docs.yml
@ -8,6 +8,9 @@ on:
      - "**"
      - "!gh-pages"
 env:
  UV_FROZEN: "1"
 jobs:
  build-docs:
    if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -9,10 +9,6 @@ on:
      - "!main"
      - "!gh-pages"
 env:
  # disable keyring (https://github.com/actions/runner-images/issues/6185):
  PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
 jobs:
  code-checks:
    if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@ -6,14 +6,21 @@ on:
                description: "If true, the docs will be deployed."
                default: false
 env:
  UV_FROZEN: "1"
 jobs:
    run-docs:
        runs-on: ubuntu-latest
        steps:
        - uses: actions/checkout@v4
-        - uses: ./.github/actions/setup-poetry
+        - name: Install uv and set the python version
          uses: astral-sh/setup-uv@v5
          with:
            python-version: ${{ matrix.python-version }}
            enable-cache: true
        - name: Build docs
-          run: poetry run mkdocs build --verbose --clean
+          run: uv run mkdocs build --verbose --clean
        - name: Build and push docs
          if: inputs.deploy
-          run: poetry run mkdocs gh-deploy --force
+          run: uv run --no-sync mkdocs gh-deploy --force
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@ -4,16 +4,18 @@ on:
  release:
    types: [published]
 env:
  UV_FROZEN: "1"
 permissions:
  contents: read
 env:
  # disable keyring (https://github.com/actions/runner-images/issues/6185):
  PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
 jobs:
  build-and-publish:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.12']
    environment:
      name: pypi
      url: https://pypi.org/p/docling
@ -21,9 +23,15 @@ jobs:
      id-token: write  # IMPORTANT: mandatory for trusted publishing
    steps:
      - uses: actions/checkout@v4
-      - uses: ./.github/actions/setup-poetry
+      - name: Install uv and set the python version
-      - name: Build and publish
+        uses: astral-sh/setup-uv@v5
-        run: poetry build
+        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
      - name: Install dependencies
        run: uv sync --all-extras
      - name: Build package
        run: uv build
      - name: Publish distribution 📦 to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -17,12 +17,11 @@ repos:
    hooks:
      - id: mypy
        name: MyPy
-        entry: poetry run mypy docling
+        entry: uv run --no-sync mypy docling
        pass_filenames: false
        language: system
        files: '\.py$'
-      - id: poetry
+  - repo: https://github.com/astral-sh/uv-pre-commit
-        name: Poetry check
+    rev: 0.7.8
-        entry: poetry check --lock
+    hooks:
-        pass_filenames: false
+      - id: uv-lock
        language: system
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,35 @@
 ## [v2.36.1](https://github.com/docling-project/docling/releases/tag/v2.36.1) - 2025-06-04
 ### Fix
 * Remove typer and click constraints ([#1707](https://github.com/docling-project/docling/issues/1707)) ([`8846f1a`](https://github.com/docling-project/docling/commit/8846f1a393923a6badcca3a78a664a4dd31eae0d))
 ### Documentation
 * Flash-attn usage and install ([#1706](https://github.com/docling-project/docling/issues/1706)) ([`be42b03`](https://github.com/docling-project/docling/commit/be42b03f9b366bed33e95c1033b90c63f300b914))
 ## [v2.36.0](https://github.com/docling-project/docling/releases/tag/v2.36.0) - 2025-06-03
 ### Feature
 * Simplify dependencies, switch to uv ([#1700](https://github.com/docling-project/docling/issues/1700)) ([`cdd4018`](https://github.com/docling-project/docling/commit/cdd401847a35f16d69944eb6dddf57e4e0b65020))
 * New vlm-models support ([#1570](https://github.com/docling-project/docling/issues/1570)) ([`cfdf4ce`](https://github.com/docling-project/docling/commit/cfdf4cea25e681fc557df310b8bf34f3dd892e15))
 ## [v2.35.0](https://github.com/docling-project/docling/releases/tag/v2.35.0) - 2025-06-02
 ### Feature
 * Add visualization of bbox on page with html export. ([#1663](https://github.com/docling-project/docling/issues/1663)) ([`b356b33`](https://github.com/docling-project/docling/commit/b356b33059bdeeaf1584d9d189cbf1c4832e367c))
 ### Fix
 * Guess HTML content starting with script tag ([#1673](https://github.com/docling-project/docling/issues/1673)) ([`984cb13`](https://github.com/docling-project/docling/commit/984cb137f6a8ae2f3a63623add6c474d97ef8739))
 * UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte ([#1665](https://github.com/docling-project/docling/issues/1665)) ([`51d3450`](https://github.com/docling-project/docling/commit/51d34509156e2dbec9e697276681d59f9ca7e020))
 ### Documentation
 * Fix typo in index.md ([#1676](https://github.com/docling-project/docling/issues/1676)) ([`11ca4f7`](https://github.com/docling-project/docling/commit/11ca4f7a7bd8068bee472510dd71f1cd58f86f17))
 ## [v2.34.0](https://github.com/docling-project/docling/releases/tag/v2.34.0) - 2025-05-22
 ### Feature
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -6,70 +6,52 @@ For more details on the contributing guidelines head to the Docling Project [com
 ## Developing
-### Usage of Poetry
+### Usage of uv
-We use Poetry to manage dependencies.
+We use [uv](https://docs.astral.sh/uv/) as package and project manager.
 #### Installation
-To install Poetry, follow the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
+To install `uv`, check the documentation on [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
-1. Install Poetry globally on your machine:
+#### Create an environment and sync it
    ```bash
    curl -sSL https://install.python-poetry.org | python3 -
    ```
    The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
-2. Make sure Poetry is in your `$PATH`:
+You can use `uv sync` to create a project virtual environment (if it does not already exist) and sync
-    - for `zsh`:
+the project's dependencies with the environment.
        ```sh
        echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
        ```
    - for `bash`:
        ```sh
        echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
        ```
 3. The official guidelines linked above include useful details on configuring autocomplete for most shell environments, e.g., Bash and Zsh.
 #### Create a Virtual Environment and Install Dependencies
 To activate the Virtual Environment, run:
 ```bash
-poetry shell
+uv sync
 ```
-This will spawn a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
+#### Use a specific Python version (optional)
 If you need to work with a specific version of Python, you can create a new virtual environment for that version
 and run the sync command:
 ```bash
-poetry install
+uv venv --python 3.12
 uv sync
 ```
-**(Advanced) Use a Specific Python Version**
+More detailed options are described in the [Using Python environments](https://docs.astral.sh/uv/pip/environments/) documentation.
-If you need to work with a specific (older) version of Python, run:
+#### Add a new dependency
 Simply use the `uv add` command. The `pyproject.toml` and `uv.lock` files will be updated.
 ```bash
-poetry env use $(which python3.8)
+uv add [OPTIONS] <PACKAGES|--requirements <REQUIREMENTS>>
 ```
 This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` with the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
 #### Add a New Dependency
 ```bash
 poetry add NAME
 ```
 ## Coding Style Guidelines
 We use the following tools to enforce code style:
- iSort, to sort imports
+- [Ruff](https://docs.astral.sh/ruff/), as linter and code formatter
- Black, to format code
+- [MyPy](https://mypy.readthedocs.io), as static type checker
-We run a series of checks on the codebase on every commit using `pre-commit`. To install the hooks, run:
+A set of styling checks, as well as regression tests, are defined and managed through the [pre-commit](https://pre-commit.com/) framework.
 To ensure that those scripts run automatically before a commit is finalized, install `pre-commit` on your local repository:
 ```bash
 pre-commit install
@ -81,7 +63,7 @@ To run the checks on-demand, run:
 pre-commit run --all-files
 ```
-Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
+Note: Checks like `Ruff` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
 ## Tests
@ -94,7 +76,7 @@ When a change improves the conversion results, multiple reference documents must
 The reference data can be regenerated with
 ```sh
-DOCLING_GEN_TEST_DATA=1 poetry run pytest
+DOCLING_GEN_TEST_DATA=1 uv run pytest
 ```
 All PRs modifying the reference test data require a double review to guarantee we don't miss edge cases.
--- a/README.md
+++ b/README.md
@ -14,9 +14,8 @@
 [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
-[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
+[![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
-[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
 [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
@ -36,7 +35,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
+* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
 * 💻 Simple and convenient CLI
 ### Coming soon
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -28,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
@ -36,8 +37,6 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    EasyOcrOptions,
    OcrOptions,
    PaginatedPipelineOptions,
@ -45,14 +44,16 @@ from docling.datamodel.pipeline_options import (
    PdfPipeline,
    PdfPipelineOptions,
    TableFormerMode,
    VlmModelType,
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    granite_vision_vlm_ollama_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA,
    GRANITE_VISION_TRANSFORMERS,
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
    VlmModelType,
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
@ -579,20 +580,16 @@ def convert(  # noqa: C901
            )
            if vlm_model == VlmModelType.GRANITE_VISION:
-                pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-                pipeline_options.vlm_options = (
+                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
                    granite_vision_vlm_ollama_conversion_options
                )
            elif vlm_model == VlmModelType.SMOLDOCLING:
-                pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                if sys.platform == "darwin":
                    try:
                        import mlx_vlm
-                        pipeline_options.vlm_options = (
+                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                            smoldocling_vlm_mlx_conversion_options
                        )
                    except ImportError:
                        _log.warning(
                            "To run SmolDocling faster, please install mlx-vlm:\n"
--- a/docling/datamodel/accelerator_options.py
+++ b/docling/datamodel/accelerator_options.py
@ -0,0 +1,68 @@
 import logging
 import os
 import re
 from enum import Enum
 from typing import Any, Union
 from pydantic import field_validator, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 _log = logging.getLogger(__name__)
 class AcceleratorDevice(str, Enum):
    """Devices to run model inference.

    The enum mixes in ``str``, so each member is also a plain string
    equal to its value (e.g. ``AcceleratorDevice.CPU == "cpu"``).
    """
    AUTO = "auto"
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"
 class AcceleratorOptions(BaseSettings):
    # Settings describing how many threads and which device to use for inference.
    # `env_prefix="DOCLING_"` makes pydantic-settings look for DOCLING_-prefixed
    # environment variables (e.g. DOCLING_NUM_THREADS, referenced below).
    model_config = SettingsConfigDict(
        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
    )
    # May be filled from OMP_NUM_THREADS by check_alternative_envvars() when it is
    # not passed explicitly and DOCLING_NUM_THREADS is unset.
    num_threads: int = 4
    # Restricted by validate_device() to an AcceleratorDevice value or "cuda:N".
    device: Union[str, AcceleratorDevice] = "auto"
    cuda_use_flash_attention2: bool = False
    @field_validator("device")
    def validate_device(cls, value):
        # Returns `value` unchanged when it is acceptable; raises ValueError otherwise.
        # "auto", "cpu", "cuda", "mps", or "cuda:N"
        if value in {d.value for d in AcceleratorDevice} or re.match(
            r"^cuda(:\d+)?$", value
        ):
            return value
        raise ValueError(
            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
        )
    @model_validator(mode="before")
    @classmethod
    def check_alternative_envvars(cls, data: Any) -> Any:
        r"""
        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
        The alternative envvar is used only if it is valid and the regular envvar is not set.
        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
        the same functionality. In case the alias envvar is set and the user tries to override the
        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
        as an extra input instead of simply overwriting the envvar value for that parameter.

        Returns the (possibly updated) input data; non-dict input is returned untouched.
        """
        # Only dict input is inspected; anything else is passed through as-is.
        if isinstance(data, dict):
            input_num_threads = data.get("num_threads")
            # Check if to set the num_threads from the alternative envvar
            if input_num_threads is None:
                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
                omp_num_threads = os.getenv("OMP_NUM_THREADS")
                # The regular envvar wins: OMP_NUM_THREADS is only consulted
                # when DOCLING_NUM_THREADS is absent.
                if docling_num_threads is None and omp_num_threads is not None:
                    try:
                        data["num_threads"] = int(omp_num_threads)
                    except ValueError:
                        # A non-integer value is logged and ignored, so `data`
                        # is left without an injected num_threads.
                        _log.error(
                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
                            omp_num_threads,
                        )
        return data
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -13,11 +13,11 @@ from docling_core.types.doc import (
    TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 # DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.io import (
    DocumentStream,
 )
 # DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict, Field, computed_field
@ -67,7 +67,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.XML_JATS: ["xml", "nxml"],
-    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
    InputFormat.CSV: ["csv"],
    InputFormat.XLSX: ["xlsx"],
@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
    error_message: str
 # class Cell(BaseModel):
 #    id: int
 #    text: str
 #    bbox: BoundingBox
 class Cluster(BaseModel):
    id: int
    label: DocItemLabel
@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
    clusters: List[Cluster] = []
 class VlmPredictionToken(BaseModel):
    # One entry of VlmPrediction.generated_tokens.
    # NOTE(review): `token` looks like a vocabulary id and `logprob` its
    # log-probability, with -1 / "" as "not set" defaults — confirm with the producer.
    text: str = ""
    token: int = -1
    logprob: float = -1
 class VlmPrediction(BaseModel):
    # Output of a VLM run: the generated text plus optional per-token details.
    # NOTE(review): generation_time defaults to -1, presumably meaning
    # "not measured"; the unit is not visible here — confirm.
    text: str = ""
    generated_tokens: list[VlmPredictionToken] = []
    generation_time: float = -1
 class ContainerElement(
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
            else:
                return "application/xml"
-        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+        if re.match(
            r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
            content_str,
            re.DOTALL,
        ):
            return "text/html"
        p = re.compile(
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -1,6 +1,4 @@
 import logging
 import os
 import re
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@ -10,73 +8,28 @@ from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    field_validator,
    model_validator,
 )
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing_extensions import deprecated
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
 )
 from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
    VlmModelType,
 )
 _log = logging.getLogger(__name__)
 class AcceleratorDevice(str, Enum):
    """Devices to run model inference"""
    AUTO = "auto"
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"
 class AcceleratorOptions(BaseSettings):
    model_config = SettingsConfigDict(
        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
    )
    num_threads: int = 4
    device: Union[str, AcceleratorDevice] = "auto"
    cuda_use_flash_attention2: bool = False
    @field_validator("device")
    def validate_device(cls, value):
        # "auto", "cpu", "cuda", "mps", or "cuda:N"
        if value in {d.value for d in AcceleratorDevice} or re.match(
            r"^cuda(:\d+)?$", value
        ):
            return value
        raise ValueError(
            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
        )
    @model_validator(mode="before")
    @classmethod
    def check_alternative_envvars(cls, data: Any) -> Any:
        r"""
        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
        The alternative envvar is used only if it is valid and the regular envvar is not set.
        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
        the same functionality. In case the alias envvar is set and the user tries to override the
        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
        as an extra input instead of simply overwriting the evvar value for that parameter.
        """
        if isinstance(data, dict):
            input_num_threads = data.get("num_threads")
            # Check if to set the num_threads from the alternative envvar
            if input_num_threads is None:
                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
                omp_num_threads = os.getenv("OMP_NUM_THREADS")
                if docling_num_threads is None and omp_num_threads is not None:
                    try:
                        data["num_threads"] = int(omp_num_threads)
                    except ValueError:
                        _log.error(
                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
                            omp_num_threads,
                        )
        return data
 class BaseOptions(BaseModel):
    """Base class for options."""
@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
    lang: List[str] = [
        "english",
        "chinese",
-    ]  # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
+    ]
-    # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
+    # However, language as a parameter is not supported by rapidocr yet
    # and hence changing this options doesn't affect anything.
    # For more details on supported languages by RapidOCR visit
    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
    # For more details on the following options visit
    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
    # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
    text_score: float = 0.5  # same default as rapidocr
    use_det: Optional[bool] = None  # same default as rapidocr
    use_cls: Optional[bool] = None  # same default as rapidocr
    use_rec: Optional[bool] = None  # same default as rapidocr
    # class Device(Enum):
    #     CPU = "CPU"
    #     CUDA = "CUDA"
    #     DIRECTML = "DIRECTML"
    #     AUTO = "AUTO"
    # device: Device = Device.AUTO  # Default value is AUTO
    print_verbose: bool = False  # same default as rapidocr
    det_model_path: Optional[str] = None  # same default as rapidocr
@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
        return self.repo_id.replace("/", "--")
 # SmolVLM
 smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
 )
-# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
+
 # GraniteVision
 granite_picture_description = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    prompt="What is shown in this image?",
 )
 class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str
 class ResponseFormat(str, Enum):
    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
 class InferenceFramework(str, Enum):
    MLX = "mlx"
    TRANSFORMERS = "transformers"
    OPENAI = "openai"
 class HuggingFaceVlmOptions(BaseVlmOptions):
    kind: Literal["hf_model_options"] = "hf_model_options"
    repo_id: str
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False
    inference_framework: InferenceFramework
    response_format: ResponseFormat
    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")
 class ApiVlmOptions(BaseVlmOptions):
    kind: Literal["api_model_options"] = "api_model_options"
    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    scale: float = 2.0
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
 smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
 )
 smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
 )
 granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    # prompt="OCR the full page to markdown.",
    prompt="OCR this image.",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
 )
 granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    prompt="OCR the full page to markdown.",
    scale=1.0,
    timeout=120,
    response_format=ResponseFormat.MARKDOWN,
 )
 class VlmModelType(str, Enum):
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
    """Enum of valid PDF backends."""
@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
        False  # (To be used with vlms, or other generative models)
    )
    # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
        smoldocling_vlm_conversion_options
    )
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@ -0,0 +1,81 @@
 from enum import Enum
 from typing import Any, Dict, List, Literal
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
    """Fields common to the VLM option models declared in this module."""
    # Discriminator string; the subclasses in this module pin it to a Literal.
    kind: str
    # Prompt text configured for the model.
    prompt: str
 class ResponseFormat(str, Enum):
    """Output formats that a VLM option set can declare for its responses."""
    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
    HTML = "html"
 class InferenceFramework(str, Enum):
    """Frameworks an ``InlineVlmOptions`` instance can select for inference."""
    MLX = "mlx"
    TRANSFORMERS = "transformers"
 class TransformersModelType(str, Enum):
    """Model-type selector used with the transformers framework.

    NOTE(review): the values presumably map to the transformers classes
    AutoModel, AutoModelForVision2Seq and AutoModelForCausalLM - confirm
    against the model loader.
    """
    AUTOMODEL = "automodel"
    AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
    AUTOMODEL_CAUSALLM = "automodel-causallm"
 class InlineVlmOptions(BaseVlmOptions):
    """Options for a VLM referenced by ``repo_id`` and run through one of the
    ``InferenceFramework`` choices.

    ``inference_framework`` and ``response_format`` have no default and must
    be supplied (together with ``repo_id`` and the inherited ``prompt``).
    """
    kind: Literal["inline_model_options"] = "inline_model_options"
    # Repository identifier of the model, e.g. "org/name".
    repo_id: str
    trust_remote_code: bool = False
    # Quantization-related settings.
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False
    inference_framework: InferenceFramework
    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
    response_format: ResponseFormat
    # Accelerator devices declared as usable for this model.
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]
    # Generation-related settings.
    scale: float = 2.0
    temperature: float = 0.0
    stop_strings: List[str] = []
    extra_generation_config: Dict[str, Any] = {}
    use_kv_cache: bool = True
    max_new_tokens: int = 4096
    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--"."""
        segments = self.repo_id.split("/")
        return "--".join(segments)
@deprecated("Use InlineVlmOptions instead.")
 class HuggingFaceVlmOptions(InlineVlmOptions):
    pass
 class ApiVlmOptions(BaseVlmOptions):
    """Options for a VLM reached through an HTTP endpoint (``url``).

    ``response_format`` has no default and must be supplied, together with
    the inherited ``prompt``.
    """
    kind: Literal["api_model_options"] = "api_model_options"
    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    # NOTE(review): headers/params are presumably forwarded with each request
    # - confirm in the API client code.
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    scale: float = 2.0
    # NOTE(review): unit is presumably seconds - confirm in the API client code.
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
--- a/docling/datamodel/vlm_model_specs.py
+++ b/docling/datamodel/vlm_model_specs.py
@ -0,0 +1,144 @@
 import logging
 from enum import Enum
 from pydantic import (
    AnyUrl,
 )
 from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
 )
 _log = logging.getLogger(__name__)
 # SmolDocling
 # SmolDocling preset using the MLX bf16 repository and the MLX framework;
 # DocTags output; MPS is the only supported device listed.
 SMOLDOCLING_MLX = InlineVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
 )
 # SmolDocling preset using the transformers framework with the
 # vision2seq model type; DocTags output; CPU, CUDA and MPS are listed.
 SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ],
    scale=2.0,
    temperature=0.0,
 )
 # GraniteVision
 # Granite Vision 3.2 2B preset using the transformers framework with the
 # vision2seq model type; Markdown output; CPU, CUDA and MPS are listed.
 GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ],
    scale=2.0,
    temperature=0.0,
 )
 # Granite Vision preset served through a local ollama chat-completions
 # endpoint; Markdown output.
 # No `temperature` is passed: ApiVlmOptions declares no such field, and
 # pydantic's default extra-field handling would silently discard it.
 GRANITE_VISION_OLLAMA = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    scale=1.0,
    timeout=120,
    response_format=ResponseFormat.MARKDOWN,
 )
 # Pixtral
 # Pixtral 12B preset using the transformers framework with the vision2seq
 # model type; Markdown output; only CPU and CUDA are listed (no MPS).
 PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
    repo_id="mistral-community/pixtral-12b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
    scale=2.0,
    temperature=0.0,
 )
 # Pixtral 12B preset using the MLX bf16 repository and the MLX framework;
 # Markdown output; MPS is the only supported device listed.
 PIXTRAL_12B_MLX = InlineVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
 )
 # Phi4
 # Phi-4 multimodal preset using the transformers framework with the
 # causal-LM model type; Markdown output; only CPU and CUDA are listed.
 # It enables trust_remote_code and passes num_logits_to_keep=0 as extra
 # generation config.
 # HuggingFaceTransformersVlmModel raises NotImplementedError for this
 # repo id when the installed transformers version is >= 4.52.0.
 PHI4_TRANSFORMERS = InlineVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
    trust_remote_code=True,
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
    scale=2.0,
    temperature=0.0,
    extra_generation_config=dict(num_logits_to_keep=0),
 )
 # Qwen
 # Qwen2.5-VL 3B Instruct preset using the MLX bf16 repository and the MLX
 # framework; Markdown output; MPS is the only supported device listed.
 QWEN25_VL_3B_MLX = InlineVlmOptions(
    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
 )
 # Gemma-3
 # Gemma-3 12B (instruction-tuned) preset using the MLX bf16 repository and
 # the MLX framework; Markdown output; MPS is the only device listed.
 GEMMA3_12B_MLX = InlineVlmOptions(
    repo_id="mlx-community/gemma-3-12b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
 )
 # Gemma-3 27B (instruction-tuned) preset using the MLX bf16 repository and
 # the MLX framework; Markdown output; MPS is the only device listed.
 GEMMA3_27B_MLX = InlineVlmOptions(
    repo_id="mlx-community/gemma-3-27b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
 )
 class VlmModelType(str, Enum):
    """String identifiers for VLM model choices.

    NOTE(review): presumably used to select one of the presets defined in
    this module - confirm at the call sites.
    """
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -186,6 +186,11 @@ class DocumentConverter:
            Tuple[Type[BasePipeline], str], BasePipeline
        ] = {}
    def _get_initialized_pipelines(
        self,
    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
        """Return the ``initialized_pipelines`` mapping held by this converter.

        The stored dict object itself is returned, not a copy.
        """
        pipelines = self.initialized_pipelines
        return pipelines
    def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
        """Generate a hash of pipeline options to use as part of the cache key."""
        options_str = str(pipeline_options.model_dump())
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import ApiVlmOptions
+from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
 from docling.exceptions import OperationNotAllowed
 from docling.models.base_model import BasePageModel
 from docling.utils.api_image_request import api_image_request
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@ -11,9 +11,10 @@ from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
+from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BaseModelWithOptions, BasePageModel
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@ -16,9 +16,10 @@ from docling_core.types.doc.labels import CodeLanguageLabel
 from PIL import Image, ImageOps
 from pydantic import BaseModel
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement
 from docling.datamodel.pipeline_options import AcceleratorOptions
 from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
@ -117,20 +118,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
        force: bool = False,
        progress: bool = False,
    ) -> Path:
-        from huggingface_hub import snapshot_download
+        return download_hf_model(
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/CodeFormula",
            force_download=force,
            local_dir=local_dir,
            revision="v1.0.2",
            local_dir=local_dir,
            force=force,
            progress=progress,
        )
        return Path(download_path)
    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """
        Determines if a given element in a document can be processed by the model.
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@ -13,8 +13,9 @@ from docling_core.types.doc import (
 from PIL import Image
 from pydantic import BaseModel
-from docling.datamodel.pipeline_options import AcceleratorOptions
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.models.base_model import BaseEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
@ -105,20 +106,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
    def download_models(
        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
    ) -> Path:
-        from huggingface_hub import snapshot_download
+        return download_hf_model(
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/DocumentFigureClassifier",
            force_download=force,
            local_dir=local_dir,
            revision="v1.0.1",
            local_dir=local_dir,
            force=force,
            progress=progress,
        )
        return Path(download_path)
    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """
        Determines if the given element can be processed by the classifier.
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@ -9,11 +9,10 @@ import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    EasyOcrOptions,
    OcrOptions,
 )
--- a/docling/models/hf_vlm_model.py
+++ b/docling/models/hf_vlm_model.py
@ -1,182 +0,0 @@
 import logging
 import time
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    HuggingFaceVlmOptions,
 )
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 class HuggingFaceVlmModel(BasePageModel):
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        vlm_options: HuggingFaceVlmOptions,
    ):
        self.enabled = enabled
        self.vlm_options = vlm_options
        if self.enabled:
            import torch
            from transformers import (  # type: ignore
                AutoModelForVision2Seq,
                AutoProcessor,
                BitsAndBytesConfig,
            )
            device = decide_device(accelerator_options.device)
            self.device = device
            _log.debug(f"Available device for HuggingFace VLM: {device}")
            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
            # PARAMETERS:
            if artifacts_path is None:
                artifacts_path = self.download_models(self.vlm_options.repo_id)
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder
            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
            self.param_quantization_config = BitsAndBytesConfig(
                load_in_8bit=vlm_options.load_in_8bit,  # True,
                llm_int8_threshold=vlm_options.llm_int8_threshold,  # 6.0
            )
            self.param_quantized = vlm_options.quantized  # False
            self.processor = AutoProcessor.from_pretrained(artifacts_path)
            if not self.param_quantized:
                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                    artifacts_path,
                    device_map=device,
                    torch_dtype=torch.bfloat16,
                    _attn_implementation=(
                        "flash_attention_2"
                        if self.device.startswith("cuda")
                        and accelerator_options.cuda_use_flash_attention2
                        else "eager"
                    ),
                )  # .to(self.device)
            else:
                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                    artifacts_path,
                    device_map=device,
                    torch_dtype="auto",
                    quantization_config=self.param_quantization_config,
                    _attn_implementation=(
                        "flash_attention_2"
                        if self.device.startswith("cuda")
                        and accelerator_options.cuda_use_flash_attention2
                        else "eager"
                    ),
                )  # .to(self.device)
    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id=repo_id,
            force_download=force,
            local_dir=local_dir,
            # revision="v0.0.1",
        )
        return Path(download_path)
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None
                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size
                    # populate page_tags with predicted doc tags
                    page_tags = ""
                    if hi_res_image:
                        if hi_res_image.mode != "RGB":
                            hi_res_image = hi_res_image.convert("RGB")
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "This is a page from a document.",
                                },
                                {"type": "image"},
                                {"type": "text", "text": self.param_question},
                            ],
                        }
                    ]
                    prompt = self.processor.apply_chat_template(
                        messages, add_generation_prompt=False
                    )
                    inputs = self.processor(
                        text=prompt, images=[hi_res_image], return_tensors="pt"
                    )
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
                    start_time = time.time()
                    # Call model to generate:
                    generated_ids = self.vlm_model.generate(
                        **inputs, max_new_tokens=4096, use_cache=True
                    )
                    generation_time = time.time() - start_time
                    generated_texts = self.processor.batch_decode(
                        generated_ids[:, inputs["input_ids"].shape[1] :],
                        skip_special_tokens=False,
                    )[0]
                    num_tokens = len(generated_ids[0])
                    page_tags = generated_texts
                    _log.debug(
                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                    )
                    # inference_time = time.time() - start_time
                    # tokens_per_second = num_tokens / generation_time
                    # print("")
                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
                    # print(f"Total tokens on page: {num_tokens:.2f}")
                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
                    # print("")
                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
                yield page
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -10,11 +10,12 @@ from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import Image
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import AcceleratorOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
 from docling.utils.profiling import TimeRecorder
@ -83,20 +84,14 @@ class LayoutModel(BasePageModel):
        force: bool = False,
        progress: bool = False,
    ) -> Path:
-        from huggingface_hub import snapshot_download
+        return download_hf_model(
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/docling-models",
-            force_download=force,
+            revision="v2.2.0",
            local_dir=local_dir,
-            revision="v2.1.0",
+            force=force,
            progress=progress,
        )
        return Path(download_path)
    def draw_clusters_and_cells_side_by_side(
        self, conv_res, page, clusters, mode_prefix: str, show: bool = False
    ):
--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@ -8,10 +8,10 @@ from typing import Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    OcrMacOptions,
    OcrOptions,
 )
--- a/docling/models/picture_description_api_model.py
+++ b/docling/models/picture_description_api_model.py
@ -5,8 +5,8 @@ from typing import Optional, Type, Union
 from PIL import Image
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionApiOptions,
    PictureDescriptionBaseOptions,
 )
--- a/docling/models/picture_description_base_model.py
+++ b/docling/models/picture_description_base_model.py
@ -13,8 +13,8 @@ from docling_core.types.doc.document import (  # TODO: move import to docling_co
 )
 from PIL import Image
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionBaseOptions,
 )
 from docling.models.base_model import (
--- a/docling/models/picture_description_vlm_model.py
+++ b/docling/models/picture_description_vlm_model.py
@ -4,16 +4,21 @@ from typing import Optional, Type, Union
 from PIL import Image
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionBaseOptions,
    PictureDescriptionVlmOptions,
 )
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
 from docling.models.utils.hf_model_download import (
    HuggingFaceModelDownloadMixin,
 )
 from docling.utils.accelerator_utils import decide_device
-class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
+class PictureDescriptionVlmModel(
    PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
 ):
    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        return PictureDescriptionVlmOptions
@ -66,26 +71,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
            self.provenance = f"{self.options.repo_id}"
    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id=repo_id,
            force_download=force,
            local_dir=local_dir,
        )
        return Path(download_path)
    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        from transformers import GenerationConfig
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@ -7,11 +7,10 @@ import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    OcrOptions,
    RapidOcrOptions,
 )
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -13,16 +13,16 @@ from docling_core.types.doc.page import (
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    TableFormerMode,
    TableStructureOptions,
 )
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
@ -90,20 +90,14 @@ class TableStructureModel(BasePageModel):
    def download_models(
        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
    ) -> Path:
-        from huggingface_hub import snapshot_download
+        return download_hf_model(
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/docling-models",
            force_download=force,
            local_dir=local_dir,
            revision="v2.2.0",
            local_dir=local_dir,
            force=force,
            progress=progress,
        )
        return Path(download_path)
    def draw_table_and_cells(
        self,
        conv_res: ConversionResult,
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@ -13,10 +13,10 @@ import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    OcrOptions,
    TesseractCliOcrOptions,
 )
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -7,10 +7,10 @@ from typing import Iterable, Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    OcrOptions,
    TesseractOcrOptions,
 )
--- a/docling/models/utils/init.py
+++ b/docling/models/utils/init.py
--- a/docling/models/utils/hf_model_download.py
+++ b/docling/models/utils/hf_model_download.py
@ -0,0 +1,40 @@
 import logging
 from pathlib import Path
 from typing import Optional
 _log = logging.getLogger(__name__)
 def download_hf_model(
    repo_id: str,
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
    revision: Optional[str] = None,
 ) -> Path:
    """Download a snapshot of a Hugging Face repository and return its path.

    Args:
        repo_id: Repository identifier passed to ``snapshot_download``.
        local_dir: Target directory forwarded as ``local_dir``; ``None``
            leaves the choice to ``snapshot_download``.
        force: Forwarded as ``force_download``.
        progress: When falsy, huggingface_hub progress bars are disabled
            before the download starts.
        revision: Repository revision forwarded as ``revision``.

    Returns:
        The value returned by ``snapshot_download`` wrapped in ``Path``.
    """
    # Imported inside the function, so huggingface_hub is only loaded when a
    # download is actually requested.
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import disable_progress_bars
    if not progress:
        disable_progress_bars()
    snapshot_kwargs = dict(
        repo_id=repo_id,
        force_download=force,
        local_dir=local_dir,
        revision=revision,
    )
    return Path(snapshot_download(**snapshot_kwargs))
 class HuggingFaceModelDownloadMixin:
    """Mixin that gives a model class a ``download_models`` static helper."""
    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
        revision: Optional[str] = None,
    ) -> Path:
        """Download the given repository via ``download_hf_model``.

        Args:
            repo_id: Repository identifier to download.
            local_dir: Optional target directory.
            force: Forwarded to ``download_hf_model`` as ``force``.
            progress: Forwarded to ``download_hf_model`` as ``progress``.
            revision: Optional repository revision to pin; ``None`` (the
                default) keeps the previous behavior of not pinning one.

        Returns:
            The path returned by ``download_hf_model``.
        """
        return download_hf_model(
            repo_id=repo_id,
            local_dir=local_dir,
            force=force,
            progress=progress,
            revision=revision,
        )
--- a/docling/models/vlm_models_inline/init.py
+++ b/docling/models/vlm_models_inline/init.py
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@ -0,0 +1,194 @@
 import importlib.metadata
 import logging
 import time
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Any, Optional
 from docling.datamodel.accelerator_options import (
    AcceleratorOptions,
 )
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
    InlineVlmOptions,
    TransformersModelType,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
    HuggingFaceModelDownloadMixin,
 )
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
    """Page model that runs a vision-language model through Hugging Face `transformers`.
    For every valid page the page image is rendered, sent through the model
    together with a prompt, and the decoded text is stored in
    `page.predictions.vlm_response`.
    """
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        vlm_options: InlineVlmOptions,
    ):
        """Load the processor, model and generation config when enabled.
        Args:
            enabled: When false, nothing is loaded and only `enabled` and
                `vlm_options` are set on the instance.
            artifacts_path: Where to load the model from. If `None`, the model
                is fetched with `download_models`. Otherwise, if a sub-folder
                named after `repo_id` (with "/" replaced by "--") exists, that
                sub-folder is used; else the path is used as given.
            accelerator_options: Used to pick the device and to decide whether
                flash attention 2 is requested on CUDA devices.
            vlm_options: Model and generation settings.
        Raises:
            NotImplementedError: If Phi-4-multimodal is requested together with
                transformers 4.52 or newer.
        """
        self.enabled = enabled
        self.vlm_options = vlm_options
        if self.enabled:
            import torch
            from transformers import (
                AutoModel,
                AutoModelForCausalLM,
                AutoModelForVision2Seq,
                AutoProcessor,
                BitsAndBytesConfig,
                GenerationConfig,
            )
            transformers_version = importlib.metadata.version("transformers")
            # Compare numerically: a plain string comparison orders "4.9.0"
            # after "4.52.0" and "4.100.0" before it.
            if (
                self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct"
                and self._major_minor(transformers_version) >= (4, 52)
            ):
                raise NotImplementedError(
                    f"Phi 4 only works with transformers<4.52.0 but you have {transformers_version=}. Please downgrade by running pip install -U 'transformers<4.52.0'."
                )
            self.device = decide_device(
                accelerator_options.device,
                supported_devices=vlm_options.supported_devices,
            )
            _log.debug(f"Available device for VLM: {self.device}")
            self.use_cache = vlm_options.use_kv_cache
            self.max_new_tokens = vlm_options.max_new_tokens
            self.temperature = vlm_options.temperature
            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
            if artifacts_path is None:
                artifacts_path = self.download_models(self.vlm_options.repo_id)
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder
            # NOTE(review): this config is stored but is not passed to
            # `from_pretrained` below, so `vlm_options.quantized` currently has
            # no effect on how the model is loaded - confirm whether intended.
            self.param_quantization_config: Optional[BitsAndBytesConfig] = None
            if vlm_options.quantized:
                self.param_quantization_config = BitsAndBytesConfig(
                    load_in_8bit=vlm_options.load_in_8bit,
                    llm_int8_threshold=vlm_options.llm_int8_threshold,
                )
            # Pick the auto-class matching the configured model type; plain
            # AutoModel is the fallback for every other value.
            model_cls: Any = AutoModel
            if (
                self.vlm_options.transformers_model_type
                == TransformersModelType.AUTOMODEL_CAUSALLM
            ):
                model_cls = AutoModelForCausalLM
            elif (
                self.vlm_options.transformers_model_type
                == TransformersModelType.AUTOMODEL_VISION2SEQ
            ):
                model_cls = AutoModelForVision2Seq
            self.processor = AutoProcessor.from_pretrained(
                artifacts_path,
                trust_remote_code=vlm_options.trust_remote_code,
            )
            self.vlm_model = model_cls.from_pretrained(
                artifacts_path,
                device_map=self.device,
                _attn_implementation=(
                    "flash_attention_2"
                    if self.device.startswith("cuda")
                    and accelerator_options.cuda_use_flash_attention2
                    else "eager"
                ),
                trust_remote_code=vlm_options.trust_remote_code,
            )
            # Load generation config
            self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
    @staticmethod
    def _major_minor(version: str) -> tuple:
        """Return the leading numeric (major, minor) components of a version string.
        Parsing stops at the first component that does not start with an ASCII
        digit, so "4.52.0.dev0" gives (4, 52) and an unparseable string gives ().
        """
        numbers = []
        for piece in version.split(".")[:2]:
            digits = ""
            for ch in piece:
                if ch not in "0123456789":
                    break
                digits += ch
            if not digits:
                break
            numbers.append(int(digits))
        return tuple(numbers)
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Run the model on each page of the batch and yield the pages.
        Pages whose backend is not valid are yielded untouched. For the other
        pages the decoded model output and the generation time are stored in
        `page.predictions.vlm_response`.
        """
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None
                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
                    # Define prompt structure
                    prompt = self.formulate_prompt()
                    inputs = self.processor(
                        text=prompt, images=[hi_res_image], return_tensors="pt"
                    ).to(self.device)
                    start_time = time.time()
                    # Call model to generate:
                    generated_ids = self.vlm_model.generate(
                        **inputs,
                        max_new_tokens=self.max_new_tokens,
                        use_cache=self.use_cache,
                        temperature=self.temperature,
                        generation_config=self.generation_config,
                        **self.vlm_options.extra_generation_config,
                    )
                    generation_time = time.time() - start_time
                    # The returned sequence is treated as prompt + completion:
                    # the prompt part is cut off before decoding.
                    prompt_length = inputs["input_ids"].shape[1]
                    generated_texts = self.processor.batch_decode(
                        generated_ids[:, prompt_length:],
                        skip_special_tokens=False,
                    )[0]
                    # Count only the newly generated tokens, consistent with
                    # the slice used for decoding above.
                    num_tokens = len(generated_ids[0]) - prompt_length
                    _log.debug(
                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                    )
                    page.predictions.vlm_response = VlmPrediction(
                        text=generated_texts,
                        generation_time=generation_time,
                    )
                yield page
    def formulate_prompt(self) -> str:
        """Formulate a prompt for the VLM.
        Phi-4-multimodal gets a hand-built prompt string using its special
        tokens; every other model gets a single user message (intro text,
        image placeholder, configured prompt) rendered through the processor's
        chat template.
        """
        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
            _log.debug("Using specialized prompt for Phi-4")
            # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
            user_prompt = "<|user|>"
            assistant_prompt = "<|assistant|>"
            prompt_suffix = "<|end|>"
            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
            return prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "This is a page from a document.",
                    },
                    {"type": "image"},
                    {"type": "text", "text": self.vlm_options.prompt},
                ],
            }
        ]
        prompt = self.processor.apply_chat_template(
            messages, add_generation_prompt=False
        )
        return prompt
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@ -4,29 +4,34 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
-from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.accelerator_options import (
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    HuggingFaceVlmOptions,
 )
 from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
    HuggingFaceModelDownloadMixin,
 )
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
-class HuggingFaceMlxModel(BasePageModel):
+class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
-        vlm_options: HuggingFaceVlmOptions,
+        vlm_options: InlineVlmOptions,
    ):
        self.enabled = enabled
        self.vlm_options = vlm_options
        self.max_tokens = vlm_options.max_new_tokens
        self.temperature = vlm_options.temperature
        if self.enabled:
            try:
@ -39,42 +44,24 @@ class HuggingFaceMlxModel(BasePageModel):
                )
            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
            self.apply_chat_template = apply_chat_template
            self.stream_generate = stream_generate
            # PARAMETERS:
            if artifacts_path is None:
-                artifacts_path = self.download_models(self.vlm_options.repo_id)
+                artifacts_path = self.download_models(
                    self.vlm_options.repo_id,
                )
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder
-            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+            self.param_question = vlm_options.prompt
            ## Load the model
            self.vlm_model, self.processor = load(artifacts_path)
            self.config = load_config(artifacts_path)
    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        """Download a snapshot of `repo_id` from the Hugging Face Hub.
        Args:
            repo_id: Repository identifier passed to `snapshot_download`.
            local_dir: Optional target directory passed to `snapshot_download`.
            force: Forwarded as `force_download`.
            progress: When false, `disable_progress_bars()` is called first.
        Returns:
            The snapshot location returned by `snapshot_download`, as a `Path`.
        """
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        # No revision is requested; a pinned revision="v0.0.1" was left disabled here.
        snapshot_dir = snapshot_download(
            repo_id=repo_id, force_download=force, local_dir=local_dir
        )
        return Path(snapshot_dir)
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
@ -83,12 +70,10 @@ class HuggingFaceMlxModel(BasePageModel):
            if not page._backend.is_valid():
                yield page
            else:
-                with TimeRecorder(conv_res, "vlm"):
+                with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                    assert page.size is not None
-                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size
@ -104,16 +89,45 @@ class HuggingFaceMlxModel(BasePageModel):
                    )
                    start_time = time.time()
                    _log.debug("start generating ...")
                    # Call model to generate:
                    tokens: list[VlmPredictionToken] = []
                    output = ""
                    for token in self.stream_generate(
                        self.vlm_model,
                        self.processor,
                        prompt,
                        [hi_res_image],
-                        max_tokens=4096,
+                        max_tokens=self.max_tokens,
                        verbose=False,
                        temp=self.temperature,
                    ):
                        if len(token.logprobs.shape) == 1:
                            tokens.append(
                                VlmPredictionToken(
                                    text=token.text,
                                    token=token.token,
                                    logprob=token.logprobs[token.token],
                                )
                            )
                        elif (
                            len(token.logprobs.shape) == 2
                            and token.logprobs.shape[0] == 1
                        ):
                            tokens.append(
                                VlmPredictionToken(
                                    text=token.text,
                                    token=token.token,
                                    logprob=token.logprobs[0, token.token],
                                )
                            )
                        else:
                            _log.warning(
                                f"incompatible shape for logprobs: {token.logprobs.shape}"
                            )
                        output += token.text
                        if "</doctag>" in token.text:
                            break
@ -121,15 +135,13 @@ class HuggingFaceMlxModel(BasePageModel):
                    generation_time = time.time() - start_time
                    page_tags = output
-                    _log.debug(f"Generation time {generation_time:.2f} seconds.")
+                    _log.debug(
-
+                        f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
-                    # inference_time = time.time() - start_time
+                    )
-                    # tokens_per_second = num_tokens / generation_time
+                    page.predictions.vlm_response = VlmPrediction(
-                    # print("")
+                        text=page_tags,
-                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
+                        generation_time=generation_time,
-                    # print(f"Total tokens on page: {num_tokens:.2f}")
+                        generated_tokens=tokens,
-                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
+                    )
                    # print("")
                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
                yield page
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@ -1,29 +1,46 @@
 import logging
 import re
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
-from docling_core.types import DoclingDocument
+from docling_core.types.doc import (
-from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
+    BoundingBox,
    DocItem,
    DoclingDocument,
    ImageRef,
    PictureItem,
    ProvenanceItem,
    TextItem,
 )
 from docling_core.types.doc.base import (
    BoundingBox,
    Size,
 )
 from docling_core.types.doc.document import DocTagsDocument
 from PIL import Image as PILImage
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
    VlmPipelineOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
 )
 from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel
-from docling.models.hf_mlx_model import HuggingFaceMlxModel
+from docling.models.vlm_models_inline.hf_transformers_model import (
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
+    HuggingFaceTransformersVlmModel,
 )
 from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -66,8 +83,8 @@ class VlmPipeline(PaginatedPipeline):
                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                ),
            ]
-        elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
+        elif isinstance(self.pipeline_options.vlm_options, InlineVlmOptions):
-            vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
+            vlm_options = cast(InlineVlmOptions, self.pipeline_options.vlm_options)
            if vlm_options.inference_framework == InferenceFramework.MLX:
                self.build_pipe = [
                    HuggingFaceMlxModel(
@ -77,15 +94,19 @@ class VlmPipeline(PaginatedPipeline):
                        vlm_options=vlm_options,
                    ),
                ]
-            else:
+            elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
                self.build_pipe = [
-                    HuggingFaceVlmModel(
+                    HuggingFaceTransformersVlmModel(
                        enabled=True,  # must be always enabled for this pipeline to make sense.
                        artifacts_path=artifacts_path,
                        accelerator_options=pipeline_options.accelerator_options,
                        vlm_options=vlm_options,
                    ),
                ]
            else:
                raise ValueError(
                    f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
                )
        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
@ -116,49 +137,19 @@ class VlmPipeline(PaginatedPipeline):
                self.pipeline_options.vlm_options.response_format
                == ResponseFormat.DOCTAGS
            ):
-                doctags_list = []
+                conv_res.document = self._turn_dt_into_doc(conv_res)
                image_list = []
                for page in conv_res.pages:
                    predicted_doctags = ""
                    img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
                    if page.predictions.vlm_response:
                        predicted_doctags = page.predictions.vlm_response.text
                    if page.image:
                        img = page.image
                    image_list.append(img)
                    doctags_list.append(predicted_doctags)
                doctags_list_c = cast(List[Union[Path, str]], doctags_list)
                image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
                    doctags_list_c, image_list_c
                )
                conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
                # If forced backend text, replace model predicted text with backend one
                if self.force_backend_text:
                    scale = self.pipeline_options.images_scale
                    for element, _level in conv_res.document.iterate_items():
                        if not isinstance(element, TextItem) or len(element.prov) == 0:
                            continue
                        page_ix = element.prov[0].page_no - 1
                        page = conv_res.pages[page_ix]
                        if not page.size:
                            continue
                        crop_bbox = (
                            element.prov[0]
                            .bbox.scaled(scale=scale)
                            .to_top_left_origin(page_height=page.size.height * scale)
                        )
                        txt = self.extract_text_from_backend(page, crop_bbox)
                        element.text = txt
                        element.orig = txt
            elif (
                self.pipeline_options.vlm_options.response_format
                == ResponseFormat.MARKDOWN
            ):
                conv_res.document = self._turn_md_into_doc(conv_res)
            elif (
                self.pipeline_options.vlm_options.response_format == ResponseFormat.HTML
            ):
                conv_res.document = self._turn_html_into_doc(conv_res)
            else:
                raise RuntimeError(
                    f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
@ -192,11 +183,81 @@ class VlmPipeline(PaginatedPipeline):
        return conv_res
-    def _turn_md_into_doc(self, conv_res):
+    def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
-        predicted_text = ""
+        doctags_list = []
-        for pg_idx, page in enumerate(conv_res.pages):
+        image_list = []
        for page in conv_res.pages:
            predicted_doctags = ""
            img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
            if page.predictions.vlm_response:
-                predicted_text += page.predictions.vlm_response.text + "\n\n"
+                predicted_doctags = page.predictions.vlm_response.text
            if page.image:
                img = page.image
            image_list.append(img)
            doctags_list.append(predicted_doctags)
        doctags_list_c = cast(List[Union[Path, str]], doctags_list)
        image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
            doctags_list_c, image_list_c
        )
        conv_res.document = DoclingDocument.load_from_doctags(
            doctag_document=doctags_doc
        )
        # If forced backend text, replace model predicted text with backend one
        if page.size:
            if self.force_backend_text:
                scale = self.pipeline_options.images_scale
                for element, _level in conv_res.document.iterate_items():
                    if not isinstance(element, TextItem) or len(element.prov) == 0:
                        continue
                    crop_bbox = (
                        element.prov[0]
                        .bbox.scaled(scale=scale)
                        .to_top_left_origin(page_height=page.size.height * scale)
                    )
                    txt = self.extract_text_from_backend(page, crop_bbox)
                    element.text = txt
                    element.orig = txt
        return conv_res.document
    def _turn_md_into_doc(self, conv_res):
        def _extract_markdown_code(text):
            """
            Extracts text from markdown code blocks (enclosed in triple backticks).
            If no code blocks are found, returns the original text.
            Args:
                text (str): Input text that may contain markdown code blocks
            Returns:
                str: Extracted code if code blocks exist, otherwise original text
            """
            # Regex pattern to match content between triple backticks
            # This handles multiline content and optional language specifier
            pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
            # Search with DOTALL flag to match across multiple lines
            mtch = re.search(pattern, text, re.DOTALL)
            if mtch:
                # Return only the content of the first capturing group
                return mtch.group(1)
            else:
                # No code blocks found, return original text
                return text
        for pg_idx, page in enumerate(conv_res.pages):
            page_no = pg_idx + 1  # FIXME: might be incorrect
            predicted_text = ""
            if page.predictions.vlm_response:
                predicted_text = page.predictions.vlm_response.text + "\n\n"
            predicted_text = _extract_markdown_code(text=predicted_text)
            response_bytes = BytesIO(predicted_text.encode("utf8"))
            out_doc = InputDocument(
                path_or_stream=response_bytes,
@ -208,7 +269,113 @@ class VlmPipeline(PaginatedPipeline):
                in_doc=out_doc,
                path_or_stream=response_bytes,
            )
-        return backend.convert()
+            page_doc = backend.convert()
            if page.image is not None:
                pg_width = page.image.width
                pg_height = page.image.height
            else:
                pg_width = 1
                pg_height = 1
            conv_res.document.add_page(
                page_no=page_no,
                size=Size(width=pg_width, height=pg_height),
                image=ImageRef.from_pil(image=page.image, dpi=72)
                if page.image
                else None,
            )
            for item, level in page_doc.iterate_items():
                item.prov = [
                    ProvenanceItem(
                        page_no=pg_idx + 1,
                        bbox=BoundingBox(
                            t=0.0, b=0.0, l=0.0, r=0.0
                        ),  # FIXME: would be nice not to have to "fake" it
                        charspan=[0, 0],
                    )
                ]
                conv_res.document.append_child_item(child=item)
        return conv_res.document
    def _turn_html_into_doc(self, conv_res):
        """Build the document from the per-page HTML responses of the VLM.
        Each page's response text is unwrapped from an optional fenced code
        block, converted with `HTMLDocumentBackend`, and the resulting items
        are appended to `conv_res.document` with a placeholder provenance that
        only carries the page number.
        Args:
            conv_res: Conversion result whose `pages` carry the VLM responses.
        Returns:
            `conv_res.document` after all pages have been added.
        """
        # Content between triple backticks, with an optional language
        # specifier line and optional trailing newlines.
        fence_pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
        def _extract_html_code(text):
            """Return the payload of a triple-backtick block, or `text` itself if there is none."""
            # DOTALL lets the payload span multiple lines.
            found = re.search(fence_pattern, text, re.DOTALL)
            return found.group(1) if found else text
        for page_no, page in enumerate(conv_res.pages, start=1):
            # FIXME: page_no derived from the position in the list might be incorrect
            vlm_response = page.predictions.vlm_response
            raw_text = vlm_response.text + "\n\n" if vlm_response else ""
            html_text = _extract_html_code(text=raw_text)
            html_stream = BytesIO(html_text.encode("utf8"))
            # NOTE(review): the format is declared as MD although the HTML
            # backend is used - confirm whether InputFormat.HTML was intended.
            html_input = InputDocument(
                path_or_stream=html_stream,
                filename=conv_res.input.file.name,
                format=InputFormat.MD,
                backend=HTMLDocumentBackend,
            )
            html_backend = HTMLDocumentBackend(
                in_doc=html_input,
                path_or_stream=html_stream,
            )
            page_doc = html_backend.convert()
            # Without a page image a 1x1 page size is registered.
            if page.image is None:
                pg_width, pg_height = 1, 1
            else:
                pg_width, pg_height = page.image.width, page.image.height
            page_size = Size(width=pg_width, height=pg_height)
            page_image = (
                ImageRef.from_pil(image=page.image, dpi=72) if page.image else None
            )
            conv_res.document.add_page(
                page_no=page_no,
                size=page_size,
                image=page_image,
            )
            for item, _level in page_doc.iterate_items():
                # FIXME: would be nice not to have to "fake" the bounding box
                placeholder_prov = ProvenanceItem(
                    page_no=page_no,
                    bbox=BoundingBox(t=0.0, b=0.0, l=0.0, r=0.0),
                    charspan=[0, 0],
                )
                item.prov = [placeholder_prov]
                conv_res.document.append_child_item(child=item)
        return conv_res.document
    @classmethod
    def get_default_options(cls) -> VlmPipelineOptions:
--- a/docling/utils/accelerator_utils.py
+++ b/docling/utils/accelerator_utils.py
@ -1,13 +1,16 @@
 import logging
 from typing import List, Optional
 import torch
-from docling.datamodel.pipeline_options import AcceleratorDevice
+from docling.datamodel.accelerator_options import AcceleratorDevice
 _log = logging.getLogger(__name__)
-def decide_device(accelerator_device: str) -> str:
+def decide_device(
    accelerator_device: str, supported_devices: Optional[List[AcceleratorDevice]] = None
 ) -> str:
    r"""
    Resolve the device based on the acceleration options and the available devices in the system.
@ -20,6 +23,18 @@ def decide_device(accelerator_device: str) -> str:
    has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
    has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
    if supported_devices is not None:
        if has_cuda and AcceleratorDevice.CUDA not in supported_devices:
            _log.info(
                f"Removing CUDA from available devices because it is not in {supported_devices=}"
            )
            has_cuda = False
        if has_mps and AcceleratorDevice.MPS not in supported_devices:
            _log.info(
                f"Removing MPS from available devices because it is not in {supported_devices=}"
            )
            has_mps = False
    if accelerator_device == AcceleratorDevice.AUTO.value:  # Handle 'auto'
        if has_cuda:
            device = "cuda:0"
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@ -4,18 +4,20 @@ from typing import Optional
 from docling.datamodel.pipeline_options import (
    granite_picture_description,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
    smolvlm_picture_description,
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
 )
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.models.layout_model import LayoutModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.utils.hf_model_download import download_hf_model
 _log = logging.getLogger(__name__)
@ -75,7 +77,7 @@ def download_models(
    if with_smolvlm:
        _log.info("Downloading SmolVlm model...")
-        PictureDescriptionVlmModel.download_models(
+        download_hf_model(
            repo_id=smolvlm_picture_description.repo_id,
            local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
            force=force,
@ -84,26 +86,25 @@ def download_models(
    if with_smoldocling:
        _log.info("Downloading SmolDocling model...")
-        HuggingFaceVlmModel.download_models(
+        download_hf_model(
-            repo_id=smoldocling_vlm_conversion_options.repo_id,
+            repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
-            local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
+            local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
            force=force,
            progress=progress,
        )
    if with_smoldocling_mlx:
        _log.info("Downloading SmolDocling MLX model...")
-        HuggingFaceVlmModel.download_models(
+        download_hf_model(
-            repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
+            repo_id=SMOLDOCLING_MLX.repo_id,
-            local_dir=output_dir
+            local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
            / smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
            force=force,
            progress=progress,
        )
    if with_granite_vision:
        _log.info("Downloading Granite Vision model...")
-        PictureDescriptionVlmModel.download_models(
+        download_hf_model(
            repo_id=granite_picture_description.repo_id,
            local_dir=output_dir / granite_picture_description.repo_cache_folder,
            force=force,
--- a/docs/examples/compare_vlm_models.py
+++ b/docs/examples/compare_vlm_models.py
@ -0,0 +1,160 @@
 # Compare VLM models
 # ==================
 #
 # This example runs the VLM pipeline with different vision-language models.
 # Their runtime as well as output quality is compared.
 import json
 import sys
 import time
 from pathlib import Path
 from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 from docling.datamodel import vlm_model_specs
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 def convert(sources: list[Path], converter: DocumentConverter):
    """Convert a source with ``converter`` and export the result to disk.

    Reads the module-level ``pipeline_options`` (model id, inference
    framework and response format used for naming and logging) and
    ``out_path`` (directory receiving the JSON, Markdown and HTML exports).

    Args:
        sources: Documents to convert.
        converter: Converter whose ``convert`` method is called on a source.

    Returns:
        A list ``[source, model_id, framework, num_pages, inference_time]``
        where ``inference_time`` is the sum of the per-page
        ``vlm_response.generation_time`` values.
    """
    # NOTE(review): the ``return`` below sits inside the loop, so only the
    # first entry of ``sources`` is processed. The caller appends the returned
    # row directly, so the single-row return shape is kept as is.
    model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
    framework = pipeline_options.vlm_options.inference_framework
    for source in sources:
        print("================================================")
        print("Processing...")
        print(f"Source: {source}")
        print("---")
        print(f"Model: {model_id}")
        print(f"Framework: {framework}")
        print("================================================")
        print("")
        res = converter.convert(source)
        print("")
        fname = f"{res.input.file.stem}-{model_id}-{framework}"
        # Accumulate the generation time reported for each page.
        inference_time = 0.0
        for i, page in enumerate(res.pages):
            inference_time += page.predictions.vlm_response.generation_time
            print("")
            print(
                f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format} in {page.predictions.vlm_response.generation_time} [sec]:"
            )
            print(page.predictions.vlm_response.text)
            print(" ---------- ")
        print("===== Final output of the converted document =======")
        # The JSON export is written once, through `save_as_json`; an extra
        # manual dump to the same path would only be overwritten by it.
        res.document.save_as_json(
            out_path / f"{fname}.json",
            image_mode=ImageRefMode.PLACEHOLDER,
        )
        print(f" => produced {out_path / fname}.json")
        res.document.save_as_markdown(
            out_path / f"{fname}.md",
            image_mode=ImageRefMode.PLACEHOLDER,
        )
        print(f" => produced {out_path / fname}.md")
        res.document.save_as_html(
            out_path / f"{fname}.html",
            image_mode=ImageRefMode.EMBEDDED,
            labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
            split_page_view=True,
        )
        print(f" => produced {out_path / fname}.html")
        pg_num = res.document.num_pages()
        print("")
        print(
            f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
        )
        print("====================================================")
        return [
            source,
            model_id,
            str(framework),
            pg_num,
            inference_time,
        ]
 if __name__ == "__main__":
    # Documents that every model is benchmarked on.
    sources = [
        "tests/data/pdf/2305.03393v1-pg9.pdf",
    ]
    # Export directory; `convert` reads this module-level name.
    out_path = Path("scratch")
    out_path.mkdir(parents=True, exist_ok=True)
    ## Use VlmPipeline
    # One options object is shared by all runs (`convert` reads it as well);
    # only its `vlm_options` is swapped per model in the loop below.
    pipeline_options = VlmPipelineOptions()
    pipeline_options.generate_page_images = True
    ## On GPU systems, enable flash_attention_2 with CUDA:
    # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
    # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
    candidate_models = [
        ## DocTags / SmolDocling models
        vlm_model_specs.SMOLDOCLING_MLX,
        vlm_model_specs.SMOLDOCLING_TRANSFORMERS,
        ## Markdown models (using MLX framework)
        vlm_model_specs.QWEN25_VL_3B_MLX,
        vlm_model_specs.PIXTRAL_12B_MLX,
        vlm_model_specs.GEMMA3_12B_MLX,
        ## Markdown models (using Transformers framework)
        vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
        vlm_model_specs.PHI4_TRANSFORMERS,
        vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
    ]
    # Keep MLX models only when running on macOS.
    on_mac = sys.platform == "darwin"
    vlm_models = [
        m
        for m in candidate_models
        if on_mac or m.inference_framework != InferenceFramework.MLX
    ]
    table_headers = ["source", "model_id", "framework", "num_pages", "time"]
    rows = []
    for vlm_options in vlm_models:
        pipeline_options.vlm_options = vlm_options
        ## Set up pipeline for PDF or image inputs
        converter = DocumentConverter(
            format_options={
                input_format: PdfFormatOption(
                    pipeline_cls=VlmPipeline,
                    pipeline_options=pipeline_options,
                )
                for input_format in (InputFormat.PDF, InputFormat.IMAGE)
            },
        )
        rows.append(convert(sources=sources, converter=converter))
        # Print the table accumulated so far after every model run.
        print(tabulate(rows, headers=table_headers))
        print("see if memory gets released ...")
        time.sleep(10)
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@ -3,10 +3,9 @@ import logging
 import time
 from pathlib import Path
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@ -1,101 +1,46 @@
-import json
+from docling.datamodel import vlm_model_specs
 import time
 from pathlib import Path
 from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
-sources = [
+source = "https://arxiv.org/pdf/2501.17887"
    # "tests/data/2305.03393v1-pg9-img.png",
    "tests/data/pdf/2305.03393v1-pg9.pdf",
 ]
-## Use experimental VlmPipeline
+###### USING SIMPLE DEFAULT VALUES
-pipeline_options = VlmPipelineOptions()
+# - SmolDocling model
-# If force_backend_text = True, text from backend will be used instead of generated text
+# - Using the transformers framework
 pipeline_options.force_backend_text = False
-## On GPU systems, enable flash_attention_2 with CUDA:
+converter = DocumentConverter(
-# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
+    format_options={
-# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
+        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
        ),
    }
 )
-## Pick a VLM model. We choose SmolDocling-256M by default
+doc = converter.convert(source=source).document
 # pipeline_options.vlm_options = smoldocling_vlm_conversion_options
-## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
+print(doc.export_to_markdown())
 pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
-## Set up pipeline for PDF or image inputs
+###### USING MACOS MPS ACCELERATOR
 # For more options see the compare_vlm_models.py example.
 pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.SMOLDOCLING_MLX,
 )
 converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
        InputFormat.IMAGE: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
 )
-out_path = Path("scratch")
+doc = converter.convert(source=source).document
 out_path.mkdir(parents=True, exist_ok=True)
-for source in sources:
+print(doc.export_to_markdown())
    start_time = time.time()
    print("================================================")
    print(f"Processing... {source}")
    print("================================================")
    print("")
    res = converter.convert(source)
    print("")
    print(res.document.export_to_markdown())
    for page in res.pages:
        print("")
        print("Predicted page in DOCTAGS:")
        print(page.predictions.vlm_response.text)
    res.document.save_as_html(
        filename=Path(f"{out_path}/{res.input.file.stem}.html"),
        image_mode=ImageRefMode.REFERENCED,
        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
    )
    with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
        fp.write(json.dumps(res.document.export_to_dict()))
    res.document.save_as_json(
        out_path / f"{res.input.file.stem}.json",
        image_mode=ImageRefMode.PLACEHOLDER,
    )
    res.document.save_as_markdown(
        out_path / f"{res.input.file.stem}.md",
        image_mode=ImageRefMode.PLACEHOLDER,
    )
    pg_num = res.document.num_pages()
    print("")
    inference_time = time.time() - start_time
    print(
        f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
    )
 print("================================================")
 print("done!")
 print("================================================")
--- a/docs/examples/run_with_accelerator.py
+++ b/docs/examples/run_with_accelerator.py
@ -1,9 +1,8 @@
 from pathlib import Path
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
 )
 from docling.datamodel.settings import settings
--- a/docs/examples/translate.py
+++ b/docs/examples/translate.py
@ -1,5 +1,4 @@
 import logging
 import time
 from pathlib import Path
 from docling_core.types.doc import ImageRefMode, TableItem, TextItem
--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@ -7,10 +7,9 @@ from dotenv import load_dotenv
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    ResponseFormat,
    VlmPipelineOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@ -44,6 +44,23 @@ This is a collection of FAQ collected from the user questions on <https://github
    Source: Issue [#283](https://github.com/docling-project/docling/issues/283#issuecomment-2465035868)
 ??? question "Is macOS x86_64 supported?"
    ### Is macOS x86_64 supported?
    Yes, Docling (still) supports running the standard pipeline on macOS x86_64.
    However, users might get into a combination of incompatible dependencies on a fresh install.
    Because Docling depends on PyTorch which dropped support for macOS x86_64 after the 2.2.2 release,
    and this old version of PyTorch works only with NumPy 1.x, users **must** ensure the correct NumPy version is running.
    ```shell
    pip install docling "numpy<2.0.0"
    ```
    Source: Issue [#1694](https://github.com/docling-project/docling/issues/1694).
 ??? question "Are text styles (bold, underline, etc) supported?"
    ### Are text styles (bold, underline, etc) supported?
@ -177,3 +194,38 @@ This is a collection of FAQ collected from the user questions on <https://github
    Also see [docling#725](https://github.com/docling-project/docling/issues/725).
    Source: Issue [docling-core#119](https://github.com/docling-project/docling-core/issues/119)
 ??? question "How to use flash attention?"
    ### How to use flash attention?
    When running models in Docling on CUDA devices, you can enable the usage of the Flash Attention2 library.
    Using environment variables:
    ```
    DOCLING_CUDA_USE_FLASH_ATTENTION2=1
    ```
    Using code:
    ```python
    from docling.datamodel.accelerator_options import (
        AcceleratorOptions,
    )
    pipeline_options = VlmPipelineOptions(
        accelerator_options=AcceleratorOptions(cuda_use_flash_attention2=True)
    )
    ```
    This requires having the [flash-attn](https://pypi.org/project/flash-attn/) package installed. Below are two alternative ways for installing it:
    ```shell
    # Building from sources (requires the CUDA dev environment)
    pip install flash-attn
    # Using pre-built wheels (not available in all possible setups)
    FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn
    ```
--- a/docs/index.md
+++ b/docs/index.md
@ -6,13 +6,13 @@
 [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
-[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
+[![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
-[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
 [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
 [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
 [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
 [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
 [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
@ -27,7 +27,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕🔥
+* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🔥
 * 💻 Simple and convenient CLI
 ### Coming soon
@ -39,7 +39,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ## Get started
 <div class="grid">
-  <a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamendals</a>
+  <a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamentals</a>
  <a href="examples/" class="card"><b>Examples</b><br />Try out recipes for various use cases, including conversion, RAG, and more</a>
  <a href="integrations/" class="card"><b>Integrations</b><br />Check out integrations with popular frameworks and tools</a>
  <a href="reference/document_converter/" class="card"><b>Reference</b><br />See more API details</a>
--- a/docs/installation/index.md
+++ b/docs/installation/index.md
@ -129,5 +129,5 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
 To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:
 ```bash
-poetry install --all-extras
+uv sync --all-extras
 ```
--- a/docs/usage/vision_models.md
+++ b/docs/usage/vision_models.md
@ -0,0 +1,121 @@
 The `VlmPipeline` in Docling allows you to convert documents end-to-end using a vision-language model.
 Docling supports vision-language models which output:
 - DocTags (e.g. [SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)), the preferred choice
 - Markdown
 - HTML
 For running Docling using local models with the `VlmPipeline`:
 === "CLI"
    ```bash
    docling --pipeline vlm FILE
    ```
 === "Python"
    See also the example [minimal_vlm_pipeline.py](./../examples/minimal_vlm_pipeline.py).
    ```python
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
            ),
        }
    )
    doc = converter.convert(source="FILE").document
    ```
 ## Available local models
 By default, the vision-language models are running locally.
 Docling lets you choose between the Hugging Face [Transformers](https://github.com/huggingface/transformers) framework and the [MLX](https://github.com/Blaizzy/mlx-vlm) framework (for Apple devices with MPS acceleration).
 The following table reports the models currently available out-of-the-box.
 | Model instance | Model | Framework | Device | Num pages | Inference time (sec) |
 | ---------------|------ | --------- | ------ | --------- | ---------------------|
 | `vlm_model_specs.SMOLDOCLING_TRANSFORMERS` | [ds4sd/SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview) | `Transformers/AutoModelForVision2Seq` | MPS | 1 |  102.212 |
 | `vlm_model_specs.SMOLDOCLING_MLX` | [ds4sd/SmolDocling-256M-preview-mlx-bf16](https://huggingface.co/ds4sd/SmolDocling-256M-preview-mlx-bf16) | `MLX`| MPS | 1 |    6.15453 |
 | `vlm_model_specs.QWEN25_VL_3B_MLX` | [mlx-community/Qwen2.5-VL-3B-Instruct-bf16](https://huggingface.co/mlx-community/Qwen2.5-VL-3B-Instruct-bf16)  |  `MLX`| MPS | 1 |   23.4951 |
 | `vlm_model_specs.PIXTRAL_12B_MLX` | [mlx-community/pixtral-12b-bf16](https://huggingface.co/mlx-community/pixtral-12b-bf16) |  `MLX` | MPS | 1 |  308.856 |
 | `vlm_model_specs.GEMMA3_12B_MLX` | [mlx-community/gemma-3-12b-it-bf16](https://huggingface.co/mlx-community/gemma-3-12b-it-bf16) |  `MLX` | MPS | 1 |  378.486 |
 | `vlm_model_specs.GRANITE_VISION_TRANSFORMERS` | [ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) | `Transformers/AutoModelForVision2Seq` | MPS | 1 |  104.75 |
 | `vlm_model_specs.PHI4_TRANSFORMERS` | [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | `Transformers/AutoModelForCausalLM` | CPU | 1 | 1175.67 |
 | `vlm_model_specs.PIXTRAL_12B_TRANSFORMERS` | [mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b) | `Transformers/AutoModelForVision2Seq` | CPU | 1 | 1828.21 |
 _Inference time is computed on a MacBook M3 Max using the example page `tests/data/pdf/2305.03393v1-pg9.pdf`. The comparison is done with the example [compare_vlm_models.py](./../examples/compare_vlm_models.py)._
 To choose the model, the code snippet above can be extended as follows:
 ```python
 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
 from docling.datamodel import vlm_model_specs
 pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.SMOLDOCLING_MLX,  # <-- change the model here
 )
 converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
 )
 doc = converter.convert(source="FILE").document
 ```
 ### Other models
 Other models can be configured by directly providing the Hugging Face `repo_id`, the prompt and a few more options.
 For example:
 ```python
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions, InferenceFramework, TransformersModelType
 pipeline_options = VlmPipelineOptions(
    vlm_options=InlineVlmOptions(
        repo_id="ibm-granite/granite-vision-3.2-2b",
        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
        response_format=ResponseFormat.MARKDOWN,
        inference_framework=InferenceFramework.TRANSFORMERS,
        transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
        supported_devices=[
            AcceleratorDevice.CPU,
            AcceleratorDevice.CUDA,
            AcceleratorDevice.MPS,
        ],
        scale=2.0,
        temperature=0.0,
    )
 )
 ```
 ## Remote models
 In addition to local models, the `VlmPipeline` allows offloading the inference to a remote service hosting the models.
 Many remote inference services can be used; the key requirement is that they offer an OpenAI-compatible API. This includes vLLM, Ollama, etc.
 More examples on how to connect with the remote inference services can be found in the following examples:
 - [vlm_pipeline_api_model.py](./../examples/vlm_pipeline_api_model.py)
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -60,6 +60,7 @@ nav:
      - Usage: usage/index.md
      - Supported formats: usage/supported_formats.md
      - Enrichment features: usage/enrichments.md
      - Vision models: usage/vision_models.md
    - FAQ:
      - FAQ: faq/index.md
  - Concepts:
@ -78,6 +79,7 @@ nav:
      - "Multi-format conversion": examples/run_with_formats.py
      - "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
      - "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
      - "VLM comparison": examples/compare_vlm_models.py
      - "Figure export": examples/export_figures.py
      - "Table export": examples/export_tables.py
      - "Multimodal export": examples/export_multimodal.py
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,20 +1,8 @@
-[tool.poetry]
+[project]
 name = "docling"
-version = "2.34.0"  # DO NOT EDIT, updated automatically
+version = "2.36.1"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
  "Christoph Auer <cau@zurich.ibm.com>",
  "Michele Dolfi <dol@zurich.ibm.com>",
  "Maxim Lysak <mly@zurich.ibm.com>",
  "Nikos Livathinos <nli@zurich.ibm.com>",
  "Ahmed Nassar <ahn@zurich.ibm.com>",
  "Panos Vagenas <pva@zurich.ibm.com>",
  "Peter Staar <taa@zurich.ibm.com>",
 ]
 license = "MIT"
 readme = "README.md"
 repository = "https://github.com/docling-project/docling"
 homepage = "https://github.com/docling-project/docling"
 keywords = [
  "docling",
  "convert",
@ -29,144 +17,136 @@ keywords = [
  "table former",
 ]
 classifiers = [
  "License :: OSI Approved :: MIT License",
  "Operating System :: MacOS :: MacOS X",
  "Operating System :: POSIX :: Linux",
  "Operating System :: Microsoft :: Windows",
  "Development Status :: 5 - Production/Stable",
  "Intended Audience :: Developers",
  "Intended Audience :: Science/Research",
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
  "Programming Language :: Python :: 3",
  "Programming Language :: Python :: 3.9",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
 ]
-packages = [{ include = "docling" }]
+readme = "README.md"
-
+authors = [
-[tool.poetry.dependencies]
+  { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
-######################
+  { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
-# actual dependencies:
+  { name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
-######################
+  { name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
-python = "^3.9"
+  { name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
-pydantic = "^2.0.0"
+  { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
-docling-core = {version = "^2.31.2", extras = ["chunking"]}
+  { name = "Peter Staar", email = "taa@zurich.ibm.com" },
 docling-ibm-models = "^3.4.0"
 docling-parse = "^4.0.0"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
 pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
 requests = "^2.32.2"
 easyocr = "^1.7"
 tesserocr = { version = "^2.7.1", optional = true }
 certifi = ">=2024.7.4"
 rtree = "^1.3.0"
 scipy = [
  { version = "^1.6.0", markers = "python_version >= '3.10'" },
  { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
 ]
-typer = ">=0.12.5,<0.16.0"
+requires-python = '>=3.9,<4.0'
-python-docx = "^1.1.2"
+dependencies = [
-python-pptx = "^1.0.2"
+  'pydantic (>=2.0.0,<3.0.0)',
-beautifulsoup4 = "^4.12.3"
+  'docling-core[chunking] (>=2.29.0,<3.0.0)',
-pandas = "^2.1.4"
+  'docling-ibm-models (>=3.4.4,<4.0.0)',
-marko = "^2.1.2"
+  'docling-parse (>=4.0.0,<5.0.0)',
-openpyxl = "^3.1.5"
+  'filetype (>=1.2.0,<2.0.0)',
-lxml = ">=4.0.0,<6.0.0"
+  'pypdfium2 (>=4.30.0,<5.0.0)',
-ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
+  'pydantic-settings (>=2.3.0,<3.0.0)',
-rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
+  'huggingface_hub (>=0.23,<1)',
-onnxruntime = [
+  'requests (>=2.32.2,<3.0.0)',
-  # 1.19.2 is the last version with python3.9 support,
+  'easyocr (>=1.7,<2.0)',
-  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
+  'certifi (>=2024.7.4)',
-  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
+  'rtree (>=1.3.0,<2.0.0)',
-  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
+  'typer (>=0.12.5,<0.17.0)',
  'python-docx (>=1.1.2,<2.0.0)',
  'python-pptx (>=1.0.2,<2.0.0)',
  'beautifulsoup4 (>=4.12.3,<5.0.0)',
  'pandas (>=2.1.4,<3.0.0)',
  'marko (>=2.1.2,<3.0.0)',
  'openpyxl (>=3.1.5,<4.0.0)',
  'lxml (>=4.0.0,<6.0.0)',
  'pillow (>=10.0.0,<12.0.0)',
  'tqdm (>=4.65.0,<5.0.0)',
  'pluggy (>=1.0.0,<2.0.0)',
  'pylatexenc (>=2.10,<3.0)',
  'scipy (>=1.6.0,<2.0.0)',
  # 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
  # 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
 ]
-transformers = [
+[project.urls]
-  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
+homepage = "https://github.com/docling-project/docling"
-  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
+repository = "https://github.com/docling-project/docling"
-]
+issues = "https://github.com/docling-project/docling/issues"
-accelerate = [
+changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"
  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
 ]
 pillow = ">=10.0.0,<12.0.0"
 tqdm = "^4.65.0"
 pluggy = "^1.0.0"
 pylatexenc = "^2.10"
 click = "<8.2.0"
-[tool.poetry.group.dev.dependencies]
+[project.entry-points.docling]
-python = "^3.9.2"
+"docling_defaults" = "docling.models.plugins.defaults"
 black = { extras = ["jupyter"], version = "^24.4.2" }
 pytest = "^7.2.2"
 pre-commit = "^3.7.1"
 mypy = "^1.10.1"
 isort = "^5.10.1"
 python-semantic-release = "^7.32.2"
 flake8 = "^6.0.0"
 pyproject-flake8 = "^6.0.0"
 pytest-xdist = "^3.3.1"
 types-requests = "^2.31.0.2"
 flake8-pyproject = "^1.2.3"
 pylint = "^2.17.5"
 pandas-stubs = "^2.1.4.231227"
 ipykernel = "^6.29.5"
 ipywidgets = "^8.1.5"
 nbqa = "^1.9.0"
 types-openpyxl = "^3.1.5.20241114"
 types-tqdm = "^4.67.0.20241221"
 coverage = "^7.6.2"
 pytest-cov = "^6.0.0"
-[tool.poetry.group.docs.dependencies]
+[project.scripts]
 mkdocs-material = "^9.5.40"
 mkdocs-jupyter = "^0.25.0"
 mkdocs-click = "^0.8.1"
 mkdocstrings = { extras = ["python"], version = "^0.27.0" }
 griffe-pydantic = "^1.1.0"
 [tool.poetry.group.examples.dependencies]
 datasets = "^2.21.0"
 python-dotenv = "^1.0.1"
 langchain-huggingface = "^0.0.3"
 langchain-milvus = "^0.1.4"
 langchain-text-splitters = "^0.2.4"
 [tool.poetry.group.constraints]
 optional = true
 [tool.poetry.group.constraints.dependencies]
 numpy = [
  { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
  { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
 ]
 [tool.poetry.group.mac_intel]
 optional = true
 [tool.poetry.group.mac_intel.dependencies]
 torch = [
  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
 ]
 torchvision = [
  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
 ]
 [tool.poetry.extras]
 tesserocr = ["tesserocr"]
 ocrmac = ["ocrmac"]
 vlm = ["transformers", "accelerate"]
 rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
 [tool.poetry.scripts]
 docling = "docling.cli.main:app"
 docling-tools = "docling.cli.tools:app"
-[tool.poetry.plugins."docling"]
+[project.optional-dependencies]
-"docling_defaults" = "docling.models.plugins.defaults"
+tesserocr = ['tesserocr (>=2.7.1,<3.0.0)']
 ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
 vlm = [
  'transformers (>=4.46.0,<5.0.0)',
  'accelerate (>=1.2.1,<2.0.0)',
  'mlx-vlm >=0.1.22 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
 ]
 rapidocr = [
  'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
  'onnxruntime (>=1.7.0,<2.0.0)',
  # 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
  # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]
-[build-system]
+[dependency-groups]
-requires = ["poetry-core"]
+dev = [
-build-backend = "poetry.core.masonry.api"
+    "pre-commit~=3.7",
    "mypy~=1.10",
    "types-setuptools~=70.3",
    "pandas-stubs~=2.1",
    "types-openpyxl~=3.1",
    "types-requests~=2.31",
    "boto3-stubs~=1.37",
    "types-urllib3~=1.26",
    "types-tqdm~=4.67",
    "coverage~=7.6",
    "pytest~=8.3",
    "pytest-cov>=6.1.1",
    "pytest-dependency~=0.6",
    "pytest-xdist~=3.3",
    "ipykernel~=6.29",
    "ipywidgets~=8.1",
    "nbqa~=1.9",
    "python-semantic-release~=7.32",
 ]
 docs = [
  "mkdocs-material~=9.5",
  "mkdocs-jupyter~=0.25",
  "mkdocs-click~=0.8",
  "mkdocstrings[python]~=0.27",
  "griffe-pydantic~=1.1",
 ]
 examples = [
  "datasets~=2.21",
  "python-dotenv~=1.0",
  "langchain-huggingface>=0.0.3",
  "langchain-milvus~=0.1",
  "langchain-text-splitters~=0.2",
 ]
 constraints = [
  'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
  'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]
 [tool.uv]
 package = true
 default-groups = "all"
 [tool.setuptools.packages.find]
 include = ["docling*"]
 [tool.ruff]
 target-version = "py39"
--- a/tests/data/docx/textbox.docx
+++ b/tests/data/docx/textbox.docx
--- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt
@ -26,69 +26,71 @@ item-0 at level 0: unspecified: group _root_
  item-21 at level 1: paragraph: 
  item-22 at level 1: paragraph: 
  item-23 at level 1: section: group textbox
-    item-24 at level 2: paragraph:   A report must be submitted wi ... saster Prevention Information Network.
+    item-24 at level 2: list: group list
-    item-25 at level 2: paragraph:   A report must also be submitt ... d Infectious Disease Reporting System.
+      item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
-    item-26 at level 2: paragraph: 
+      item-26 at level 3: list_item: A report must also be submitted  ... d Infectious Disease Reporting System.
    item-27 at level 2: paragraph: 
-  item-28 at level 1: paragraph: 
+    item-28 at level 2: paragraph: 
-  item-29 at level 1: paragraph: 
+  item-29 at level 1: list: group list
-  item-30 at level 1: paragraph: 
+    item-30 at level 2: list_item: 
  item-31 at level 1: paragraph: 
  item-32 at level 1: paragraph: 
  item-33 at level 1: paragraph: 
-  item-34 at level 1: section: group textbox
+  item-34 at level 1: paragraph: 
-    item-35 at level 2: paragraph: Health Bureau:
+  item-35 at level 1: paragraph: 
-    item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
+  item-36 at level 1: section: group textbox
-    item-37 at level 2: list: group list
+    item-37 at level 2: paragraph: Health Bureau:
-      item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
+    item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
-      item-39 at level 3: list_item: Implement appropriate epidemic p ...  the Communicable Disease Control Act.
+    item-39 at level 2: list: group list
-    item-40 at level 2: paragraph: 
+      item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
-    item-41 at level 2: paragraph: 
+      item-41 at level 3: list_item: Implement appropriate epidemic p ...  the Communicable Disease Control Act.
-  item-42 at level 1: list: group list
+    item-42 at level 2: paragraph: 
-    item-43 at level 2: list_item: 
+    item-43 at level 2: paragraph: 
-  item-44 at level 1: paragraph: 
+  item-44 at level 1: list: group list
-  item-45 at level 1: section: group textbox
+    item-45 at level 2: list_item: 
-    item-46 at level 2: paragraph: Department of Education:
+  item-46 at level 1: paragraph: 
  item-47 at level 1: section: group textbox
    item-48 at level 2: paragraph: Department of Education:
 Collabo ... vention measures at all school levels.
  item-47 at level 1: paragraph: 
  item-48 at level 1: paragraph: 
  item-49 at level 1: paragraph: 
  item-50 at level 1: paragraph: 
  item-51 at level 1: paragraph: 
  item-52 at level 1: paragraph: 
  item-53 at level 1: paragraph: 
-  item-54 at level 1: section: group textbox
+  item-54 at level 1: paragraph: 
-    item-55 at level 2: inline: group group
+  item-55 at level 1: paragraph: 
-      item-56 at level 3: paragraph: The Health Bureau will handle
+  item-56 at level 1: section: group textbox
-      item-57 at level 3: paragraph: reporting and specimen collection
+    item-57 at level 2: inline: group group
-      item-58 at level 3: paragraph: .
+      item-58 at level 3: paragraph: The Health Bureau will handle
-    item-59 at level 2: paragraph: 
+      item-59 at level 3: paragraph: reporting and specimen collection
-    item-60 at level 2: paragraph: 
+      item-60 at level 3: paragraph: .
-  item-61 at level 1: paragraph: 
+    item-61 at level 2: paragraph: 
-  item-62 at level 1: paragraph: 
+    item-62 at level 2: paragraph: 
  item-63 at level 1: paragraph: 
-  item-64 at level 1: section: group textbox
+  item-64 at level 1: paragraph: 
-    item-65 at level 2: paragraph: Whether the epidemic has eased.
+  item-65 at level 1: paragraph: 
-    item-66 at level 2: paragraph: 
+  item-66 at level 1: section: group textbox
-    item-67 at level 2: paragraph: 
+    item-67 at level 2: paragraph: Whether the epidemic has eased.
-  item-68 at level 1: paragraph: 
+    item-68 at level 2: paragraph: 
-  item-69 at level 1: section: group textbox
+    item-69 at level 2: paragraph: 
-    item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
+  item-70 at level 1: paragraph: 
-    item-71 at level 2: paragraph: No
+  item-71 at level 1: section: group textbox
-  item-72 at level 1: paragraph: 
+    item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
-  item-73 at level 1: paragraph: 
+    item-73 at level 2: paragraph: No
-  item-74 at level 1: section: group textbox
+  item-74 at level 1: paragraph: 
  item-75 at level 1: paragraph: 
  item-76 at level 1: section: group textbox
  item-77 at level 1: paragraph: 
-  item-78 at level 1: paragraph: 
+  item-78 at level 1: section: group textbox
-  item-79 at level 1: section: group textbox
+  item-79 at level 1: paragraph: 
-    item-80 at level 2: paragraph: Case closed.
+  item-80 at level 1: paragraph: 
-    item-81 at level 2: paragraph: 
+  item-81 at level 1: section: group textbox
-    item-82 at level 2: paragraph: 
+    item-82 at level 2: paragraph: Case closed.
-    item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
+    item-83 at level 2: paragraph: 
-  item-84 at level 1: paragraph: 
+    item-84 at level 2: paragraph: 
-  item-85 at level 1: section: group textbox
+    item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
  item-86 at level 1: paragraph: 
-  item-87 at level 1: paragraph: 
+  item-87 at level 1: section: group textbox
  item-88 at level 1: paragraph: 
  item-89 at level 1: paragraph: 
  item-90 at level 1: paragraph: 
--- a/tests/data/groundtruth/docling_v2/textbox.docx.json
+++ b/tests/data/groundtruth/docling_v2/textbox.docx.json
@ -4,7 +4,7 @@
  "name": "textbox",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-    "binary_hash": 830302052279341882,
+    "binary_hash": 11723995438039370060,
    "filename": "textbox.docx"
  },
  "furniture": {
@ -66,7 +66,7 @@
        "$ref": "#/groups/4"
      },
      {
-        "$ref": "#/texts/22"
+        "$ref": "#/groups/6"
      },
      {
        "$ref": "#/texts/23"
@ -84,16 +84,16 @@
        "$ref": "#/texts/27"
      },
      {
-        "$ref": "#/groups/5"
+        "$ref": "#/groups/7"
      },
      {
-        "$ref": "#/groups/7"
+        "$ref": "#/groups/9"
      },
      {
        "$ref": "#/texts/35"
      },
      {
-        "$ref": "#/groups/8"
+        "$ref": "#/groups/10"
      },
      {
        "$ref": "#/texts/37"
@ -117,7 +117,7 @@
        "$ref": "#/texts/43"
      },
      {
-        "$ref": "#/groups/9"
+        "$ref": "#/groups/11"
      },
      {
        "$ref": "#/texts/49"
@ -129,13 +129,13 @@
        "$ref": "#/texts/51"
      },
      {
-        "$ref": "#/groups/11"
+        "$ref": "#/groups/13"
      },
      {
        "$ref": "#/texts/55"
      },
      {
-        "$ref": "#/groups/12"
+        "$ref": "#/groups/14"
      },
      {
        "$ref": "#/texts/58"
@ -144,13 +144,13 @@
        "$ref": "#/texts/59"
      },
      {
-        "$ref": "#/groups/13"
+        "$ref": "#/groups/15"
      },
      {
        "$ref": "#/texts/60"
      },
      {
-        "$ref": "#/groups/14"
+        "$ref": "#/groups/16"
      },
      {
        "$ref": "#/texts/61"
@ -159,13 +159,13 @@
        "$ref": "#/texts/62"
      },
      {
-        "$ref": "#/groups/15"
+        "$ref": "#/groups/17"
      },
      {
        "$ref": "#/texts/67"
      },
      {
-        "$ref": "#/groups/16"
+        "$ref": "#/groups/18"
      },
      {
        "$ref": "#/texts/68"
@ -254,10 +254,7 @@
      },
      "children": [
        {
-          "$ref": "#/texts/18"
+          "$ref": "#/groups/5"
        },
        {
          "$ref": "#/texts/19"
        },
        {
          "$ref": "#/texts/20"
@ -272,6 +269,37 @@
    },
    {
      "self_ref": "#/groups/5",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [
        {
          "$ref": "#/texts/18"
        },
        {
          "$ref": "#/texts/19"
        }
      ],
      "content_layer": "body",
      "name": "list",
      "label": "list"
    },
    {
      "self_ref": "#/groups/6",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/22"
        }
      ],
      "content_layer": "body",
      "name": "list",
      "label": "list"
    },
    {
      "self_ref": "#/groups/7",
      "parent": {
        "$ref": "#/body"
      },
@ -283,7 +311,7 @@
          "$ref": "#/texts/29"
        },
        {
-          "$ref": "#/groups/6"
+          "$ref": "#/groups/8"
        },
        {
          "$ref": "#/texts/32"
@ -297,9 +325,9 @@
      "label": "section"
    },
    {
-      "self_ref": "#/groups/6",
+      "self_ref": "#/groups/8",
      "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/groups/7"
      },
      "children": [
        {
@ -314,7 +342,7 @@
      "label": "list"
    },
    {
-      "self_ref": "#/groups/7",
+      "self_ref": "#/groups/9",
      "parent": {
        "$ref": "#/body"
      },
@ -328,7 +356,7 @@
      "label": "list"
    },
    {
-      "self_ref": "#/groups/8",
+      "self_ref": "#/groups/10",
      "parent": {
        "$ref": "#/body"
      },
@ -342,13 +370,13 @@
      "label": "section"
    },
    {
-      "self_ref": "#/groups/9",
+      "self_ref": "#/groups/11",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
-          "$ref": "#/groups/10"
+          "$ref": "#/groups/12"
        },
        {
          "$ref": "#/texts/47"
@ -362,9 +390,9 @@
      "label": "section"
    },
    {
-      "self_ref": "#/groups/10",
+      "self_ref": "#/groups/12",
      "parent": {
-        "$ref": "#/groups/9"
+        "$ref": "#/groups/11"
      },
      "children": [
        {
@ -382,7 +410,7 @@
      "label": "inline"
    },
    {
-      "self_ref": "#/groups/11",
+      "self_ref": "#/groups/13",
      "parent": {
        "$ref": "#/body"
      },
@ -402,7 +430,7 @@
      "label": "section"
    },
    {
-      "self_ref": "#/groups/12",
+      "self_ref": "#/groups/14",
      "parent": {
        "$ref": "#/body"
      },
@ -418,31 +446,31 @@
      "name": "textbox",
      "label": "section"
    },
    {
      "self_ref": "#/groups/13",
      "parent": {
        "$ref": "#/body"
      },
      "children": [],
      "content_layer": "body",
      "name": "textbox",
      "label": "section"
    },
    {
      "self_ref": "#/groups/14",
      "parent": {
        "$ref": "#/body"
      },
      "children": [],
      "content_layer": "body",
      "name": "textbox",
      "label": "section"
    },
    {
      "self_ref": "#/groups/15",
      "parent": {
        "$ref": "#/body"
      },
      "children": [],
      "content_layer": "body",
      "name": "textbox",
      "label": "section"
    },
    {
      "self_ref": "#/groups/16",
      "parent": {
        "$ref": "#/body"
      },
      "children": [],
      "content_layer": "body",
      "name": "textbox",
      "label": "section"
    },
    {
      "self_ref": "#/groups/17",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/63"
@ -462,7 +490,7 @@
      "label": "section"
    },
    {
-      "self_ref": "#/groups/16",
+      "self_ref": "#/groups/18",
      "parent": {
        "$ref": "#/body"
      },
@ -732,38 +760,42 @@
    {
      "self_ref": "#/texts/18",
      "parent": {
-        "$ref": "#/groups/4"
+        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "list_item",
      "prov": [],
-      "orig": "  A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.",
+      "orig": "A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.",
-      "text": "  A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.",
+      "text": "A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false
-      }
+      },
      "enumerated": false,
      "marker": "-"
    },
    {
      "self_ref": "#/texts/19",
      "parent": {
-        "$ref": "#/groups/4"
+        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "list_item",
      "prov": [],
-      "orig": "  A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.",
+      "orig": "A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.",
-      "text": "  A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.",
+      "text": "A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false
-      }
+      },
      "enumerated": false,
      "marker": "-"
    },
    {
      "self_ref": "#/texts/20",
@ -792,14 +824,16 @@
    {
      "self_ref": "#/texts/22",
      "parent": {
-        "$ref": "#/body"
+        "$ref": "#/groups/6"
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "list_item",
      "prov": [],
      "orig": "",
-      "text": ""
+      "text": "",
      "enumerated": false,
      "marker": "-"
    },
    {
      "self_ref": "#/texts/23",
@ -864,7 +898,7 @@
    {
      "self_ref": "#/texts/28",
      "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
@ -882,7 +916,7 @@
    {
      "self_ref": "#/texts/29",
      "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
@ -900,7 +934,7 @@
    {
      "self_ref": "#/texts/30",
      "parent": {
-        "$ref": "#/groups/6"
+        "$ref": "#/groups/8"
      },
      "children": [],
      "content_layer": "body",
@ -920,7 +954,7 @@
    {
      "self_ref": "#/texts/31",
      "parent": {
-        "$ref": "#/groups/6"
+        "$ref": "#/groups/8"
      },
      "children": [],
      "content_layer": "body",
@ -940,7 +974,7 @@
    {
      "self_ref": "#/texts/32",
      "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
@ -952,7 +986,7 @@
    {
      "self_ref": "#/texts/33",
      "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
@ -964,7 +998,7 @@
    {
      "self_ref": "#/texts/34",
      "parent": {
-        "$ref": "#/groups/7"
+        "$ref": "#/groups/9"
      },
      "children": [],
      "content_layer": "body",
@ -990,7 +1024,7 @@
    {
      "self_ref": "#/texts/36",
      "parent": {
-        "$ref": "#/groups/8"
+        "$ref": "#/groups/10"
      },
      "children": [],
      "content_layer": "body",
@ -1092,7 +1126,7 @@
    {
      "self_ref": "#/texts/44",
      "parent": {
-        "$ref": "#/groups/10"
+        "$ref": "#/groups/12"
      },
      "children": [],
      "content_layer": "body",
@ -1110,7 +1144,7 @@
    {
      "self_ref": "#/texts/45",
      "parent": {
-        "$ref": "#/groups/10"
+        "$ref": "#/groups/12"
      },
      "children": [],
      "content_layer": "body",
@ -1128,7 +1162,7 @@
    {
      "self_ref": "#/texts/46",
      "parent": {
-        "$ref": "#/groups/10"
+        "$ref": "#/groups/12"
      },
      "children": [],
      "content_layer": "body",
@ -1146,7 +1180,7 @@
    {
      "self_ref": "#/texts/47",
      "parent": {
-        "$ref": "#/groups/9"
+        "$ref": "#/groups/11"
      },
      "children": [],
      "content_layer": "body",
@ -1158,7 +1192,7 @@
    {
      "self_ref": "#/texts/48",
      "parent": {
-        "$ref": "#/groups/9"
+        "$ref": "#/groups/11"
      },
      "children": [],
      "content_layer": "body",
@ -1206,7 +1240,7 @@
    {
      "self_ref": "#/texts/52",
      "parent": {
-        "$ref": "#/groups/11"
+        "$ref": "#/groups/13"
      },
      "children": [],
      "content_layer": "body",
@ -1224,7 +1258,7 @@
    {
      "self_ref": "#/texts/53",
      "parent": {
-        "$ref": "#/groups/11"
+        "$ref": "#/groups/13"
      },
      "children": [],
      "content_layer": "body",
@ -1236,7 +1270,7 @@
    {
      "self_ref": "#/texts/54",
      "parent": {
-        "$ref": "#/groups/11"
+        "$ref": "#/groups/13"
      },
      "children": [],
      "content_layer": "body",
@ -1260,7 +1294,7 @@
    {
      "self_ref": "#/texts/56",
      "parent": {
-        "$ref": "#/groups/12"
+        "$ref": "#/groups/14"
      },
      "children": [],
      "content_layer": "body",
@ -1278,7 +1312,7 @@
    {
      "self_ref": "#/texts/57",
      "parent": {
-        "$ref": "#/groups/12"
+        "$ref": "#/groups/14"
      },
      "children": [],
      "content_layer": "body",
@ -1356,7 +1390,7 @@
    {
      "self_ref": "#/texts/63",
      "parent": {
-        "$ref": "#/groups/15"
+        "$ref": "#/groups/17"
      },
      "children": [],
      "content_layer": "body",
@ -1374,7 +1408,7 @@
    {
      "self_ref": "#/texts/64",
      "parent": {
-        "$ref": "#/groups/15"
+        "$ref": "#/groups/17"
      },
      "children": [],
      "content_layer": "body",
@ -1386,7 +1420,7 @@
    {
      "self_ref": "#/texts/65",
      "parent": {
-        "$ref": "#/groups/15"
+        "$ref": "#/groups/17"
      },
      "children": [],
      "content_layer": "body",
@ -1398,7 +1432,7 @@
    {
      "self_ref": "#/texts/66",
      "parent": {
-        "$ref": "#/groups/15"
+        "$ref": "#/groups/17"
      },
      "children": [],
      "content_layer": "body",
--- a/tests/data/groundtruth/docling_v2/textbox.docx.md
+++ b/tests/data/groundtruth/docling_v2/textbox.docx.md
@ -19,9 +19,8 @@ show the same suggested reportable symptoms
 Yes
-  A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.
+- A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.
-
+- A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.
   A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.
 **Health Bureau:**
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@ -1,5 +1,7 @@
 from pathlib import Path
 import pytest
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import (
@ -17,6 +19,7 @@ from .verify_utils import verify_document, verify_export
 GENERATE = GEN_TEST_DATA
@pytest.mark.xfail(strict=False)
 def test_textbox_extraction():
    in_path = Path("tests/data/docx/textbox.docx")
    in_doc = InputDocument(
@ -78,8 +81,7 @@ def get_converter():
    return converter
-def test_e2e_docx_conversions():
+def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
    docx_paths = get_docx_paths()
    converter = get_converter()
    for docx_path in docx_paths:
@ -118,6 +120,20 @@ def test_e2e_docx_conversions():
            ), "export to html"
 flaky_path = Path("tests/data/docx/textbox.docx")
 def test_e2e_docx_conversions():
    _test_e2e_docx_conversions_impl(
        docx_paths=[path for path in get_docx_paths() if path != flaky_path]
    )
@pytest.mark.xfail(strict=False)
 def test_textbox_conversion():
    _test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
 def test_text_after_image_anchors():
    """
    Test to analyse whether text gets parsed after image anchors.
--- a/tests/test_e2e_conversion.py
+++ b/tests/test_e2e_conversion.py
@ -1,9 +1,10 @@
 from pathlib import Path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from .test_data_gen_flag import GEN_TEST_DATA
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -3,10 +3,10 @@ from pathlib import Path
 from typing import List, Tuple
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    EasyOcrOptions,
    OcrMacOptions,
    OcrOptions,
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
    doc_path = Path("./tests/data/html/wiki_duck.html")
    assert dci._guess_format(doc_path) == InputFormat.HTML
    html_str = (  # HTML starting with a script
        "<script>\nconsole.log('foo');\n</script>"
        '<!doctype html>\n<html lang="en-us class="no-js"></html>'
    )
    stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
    assert dci._guess_format(stream) == InputFormat.HTML
    # Valid MD
    buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
    stream = DocumentStream(name="wiki.md", stream=buf)
--- a/tests/test_options.py
+++ b/tests/test_options.py
@ -7,11 +7,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TableFormerMode,
 )
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@ -323,33 +323,33 @@ def verify_conversion_result_v1(
    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(pages_path, "w") as fw:
+        with open(pages_path, mode="w", encoding="utf-8") as fw:
            fw.write(
                json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
            )
        json_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(json_path, "w") as fw:
+        with open(json_path, mode="w", encoding="utf-8") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
        md_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(md_path, "w") as fw:
+        with open(md_path, mode="w", encoding="utf-8") as fw:
            fw.write(doc_pred_md)
        dt_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(dt_path, "w") as fw:
+        with open(dt_path, mode="w", encoding="utf-8") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test
-        with open(pages_path) as fr:
+        with open(pages_path, encoding="utf-8") as fr:
            doc_true_pages = PageList.validate_json(fr.read())
-        with open(json_path) as fr:
+        with open(json_path, encoding="utf-8") as fr:
            doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
-        with open(md_path) as fr:
+        with open(md_path, encoding="utf-8") as fr:
            doc_true_md = fr.read()
-        with open(dt_path) as fr:
+        with open(dt_path, encoding="utf-8") as fr:
            doc_true_dt = fr.read()
        if not fuzzy:
@ -408,33 +408,33 @@ def verify_conversion_result_v2(
    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(pages_path, "w") as fw:
+        with open(pages_path, mode="w", encoding="utf-8") as fw:
            fw.write(
                json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
            )
        json_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(json_path, "w") as fw:
+        with open(json_path, mode="w", encoding="utf-8") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
        md_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(md_path, "w") as fw:
+        with open(md_path, mode="w", encoding="utf-8") as fw:
            fw.write(doc_pred_md)
        dt_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(dt_path, "w") as fw:
+        with open(dt_path, mode="w", encoding="utf-8") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test
-        with open(pages_path) as fr:
+        with open(pages_path, encoding="utf-8") as fr:
            doc_true_pages = PageList.validate_json(fr.read())
-        with open(json_path) as fr:
+        with open(json_path, encoding="utf-8") as fr:
            doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
-        with open(md_path) as fr:
+        with open(md_path, encoding="utf-8") as fr:
            doc_true_md = fr.read()
-        with open(dt_path) as fr:
+        with open(dt_path, encoding="utf-8") as fr:
            doc_true_dt = fr.read()
        if not fuzzy:
@ -461,12 +461,12 @@ def verify_conversion_result_v2(
 def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
    if not os.path.exists(gtfile) or generate:
-        with open(gtfile, "w") as fw:
+        with open(gtfile, mode="w", encoding="utf-8") as fw:
            json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
        return True
    else:
-        with open(gtfile) as fr:
+        with open(gtfile, encoding="utf-8") as fr:
            true_doc = DoclingDocument.model_validate_json(fr.read())
        return verify_docitems(pred_doc, true_doc, fuzzy=False)
@ -476,11 +476,11 @@ def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool:
    file = Path(gtfile)
    if not file.exists() or generate:
-        with file.open("w") as fw:
+        with file.open(mode="w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True
-    with file.open("r") as fr:
+    with file.open(encoding="utf-8") as fr:
        true_text = fr.read()
    return pred_text == true_text
--- a/uv.lock
+++ b/uv.lock