Mirror of https://github.com/DS4SD/docling.git (synced 2025-12-08 12:48:28 +00:00)
feat: Support for Python 3.14 (#2530)
* fix dependencies for py314
* add metadata and CI tests
* add back gliner
* update error message about python 3.14 availability
* skip tests which cannot run on py 3.14
* fix lint
* remove vllm from py 3.14 deps
* safe import for vllm
* update lock
* remove torch.compile()
* update checkbox results after docling-core changes
* cannot run mlx example in CI
* add test for rapidocr backends and skip onnxruntime on py3.14
* fix other occurrences of torch.compile()
* allow torch.compile for Python <3.14; proper support will be introduced with new torch releases

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
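The changes below repeat one idiom: gate on `sys.version_info` so that users on Python < 3.14 are told to install the missing optional dependency, while users on 3.14 are told the dependency is not installable there yet. A minimal standalone sketch of that idiom (the `require_mlx_vlm` helper name is hypothetical, not part of this commit):

```python
import sys


def require_mlx_vlm() -> None:
    """Fail with a version-aware message when mlx_vlm is missing (illustrative)."""
    try:
        import mlx_vlm  # noqa: F401
    except ImportError:
        if sys.version_info < (3, 14):
            # The wheel exists for this interpreter; installing it fixes the problem.
            raise ImportError(
                "mlx-vlm is not installed. Please install it via `pip install mlx-vlm`."
            )
        # No wheels for 3.14 yet; a pip hint would only mislead the user.
        raise ImportError("mlx-vlm is not yet available on Python 3.14.")
```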
.github/workflows/checks.yml (vendored, 8 lines changed)
```diff
@@ -20,7 +20,7 @@ env:
     tests/test_asr_pipeline.py
     tests/test_threaded_pipeline.py
   PYTEST_TO_SKIP: |-
-  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping)\.py$'
+  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example)\.py$'

 jobs:
   lint:
@@ -62,7 +62,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
     steps:
       - uses: actions/checkout@v5
@@ -129,7 +129,7 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
     steps:
       - uses: actions/checkout@v5
@@ -201,7 +201,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
     steps:
       - uses: actions/checkout@v5
```
In the CLI's `convert()` entrypoint, the mlx-vlm hint becomes version-aware for both SmolDocling and GraniteDocling:

```diff
@@ -738,10 +738,15 @@ def convert(  # noqa: C901
                     pipeline_options.vlm_options = SMOLDOCLING_MLX
                 except ImportError:
-                    _log.warning(
-                        "To run SmolDocling faster, please install mlx-vlm:\n"
-                        "pip install mlx-vlm"
-                    )
+                    if sys.version_info < (3, 14):
+                        _log.warning(
+                            "To run SmolDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+                    else:
+                        _log.warning(
+                            "You can run SmolDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                        )

         elif vlm_model == VlmModelType.GRANITEDOCLING:
             pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
@@ -751,10 +756,16 @@ def convert(  # noqa: C901
                     pipeline_options.vlm_options = GRANITEDOCLING_MLX
                 except ImportError:
-                    _log.warning(
-                        "To run GraniteDocling faster, please install mlx-vlm:\n"
-                        "pip install mlx-vlm"
-                    )
+                    if sys.version_info < (3, 14):
+                        _log.warning(
+                            "To run GraniteDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+                    else:
+                        _log.warning(
+                            "You can run GraniteDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                        )

         elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
             pipeline_options.vlm_options = SMOLDOCLING_VLLM
```
In `PictureDescriptionVlmModel`, `torch.compile()` is applied only where supported, with plain eval mode as the fallback on 3.14:

```diff
@@ -1,3 +1,4 @@
+import sys
 import threading
 from collections.abc import Iterable
 from pathlib import Path
@@ -75,7 +76,10 @@ class PictureDescriptionVlmModel(
                 else "sdpa"
             ),
         )
-        self.model = torch.compile(self.model)  # type: ignore
+        if sys.version_info < (3, 14):
+            self.model = torch.compile(self.model)  # type: ignore
+        else:
+            self.model.eval()

         self.provenance = f"{self.options.repo_id}"
```
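The same three-line gate recurs in `HuggingFaceTransformersVlmModel` and `NuExtractTransformersModel` below. If one wanted to factor it out, a small helper would do; this sketch (with the hypothetical name `maybe_compile`) is not part of the commit:

```python
import sys

import torch


def maybe_compile(model: torch.nn.Module) -> torch.nn.Module:
    """Compile the model where torch.compile is supported, else return it in eval mode.

    torch.compile does not yet support Python 3.14, so on that interpreter we
    fall back to plain eager execution.
    """
    if sys.version_info < (3, 14):
        return torch.compile(model)  # type: ignore[return-value]
    model.eval()
    return model
```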
In `HuggingFaceTransformersVlmModel`:

```diff
@@ -1,5 +1,6 @@
 import importlib.metadata
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -129,7 +130,10 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
             trust_remote_code=vlm_options.trust_remote_code,
             revision=vlm_options.revision,
         )
-        self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        if sys.version_info < (3, 14):
+            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        else:
+            self.vlm_model.eval()

         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(
```
In `HuggingFaceMlxModel`, the ImportError raised for a missing mlx-vlm gets the same version-aware treatment:

```diff
@@ -50,9 +50,14 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
             from mlx_vlm.utils import load_config  # type: ignore
         except ImportError:
-            raise ImportError(
-                "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
-            )
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
+            else:
+                raise ImportError(
+                    "mlx-vlm is not installed. It is not yet available on Python 3.14."
+                )

         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
```
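A version-dependent error message like this can be exercised on any interpreter by monkeypatching `sys.version_info`; a hypothetical pytest sketch, not part of this commit:

```python
import sys

import pytest


def version_aware_import_error() -> None:
    # Stand-in for the guarded import above.
    if sys.version_info < (3, 14):
        raise ImportError("mlx-vlm is not installed. Please install it via `pip install mlx-vlm`.")
    raise ImportError("mlx-vlm is not installed. It is not yet available on Python 3.14.")


def test_py314_message(monkeypatch: pytest.MonkeyPatch) -> None:
    # Pretend we are on CPython 3.14 regardless of the real interpreter.
    monkeypatch.setattr(sys, "version_info", (3, 14, 0))
    with pytest.raises(ImportError, match="not yet available on Python 3.14"):
        version_aware_import_error()
```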
In `NuExtractTransformersModel`:

```diff
@@ -1,4 +1,5 @@
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -153,7 +154,10 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
             ),
             trust_remote_code=vlm_options.trust_remote_code,
         )
-        self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        if sys.version_info < (3, 14):
+            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        else:
+            self.vlm_model.eval()

         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
```
In `VllmVlmModel`, the previously unconditional vllm import becomes a guarded one (the "safe import for vllm" bullet above):

```diff
@@ -1,4 +1,5 @@
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -100,7 +101,18 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             return

         from transformers import AutoProcessor
-        from vllm import LLM, SamplingParams
+
+        try:
+            from vllm import LLM, SamplingParams
+        except ImportError:
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "vllm is not installed. Please install it via `pip install vllm`."
+                )
+            else:
+                raise ImportError(
+                    "vllm is not installed. It is not yet available on Python 3.14."
+                )

         # Device selection
         self.device = decide_device(
```
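Guarding this import matters because the vlm extra's environment marker (see the pyproject change below) silently omits vllm on Python 3.14: without the try/except, users there would hit a bare ModuleNotFoundError instead of an explanation.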
In the ASR pipeline's `_NativeWhisperModel`:

```diff
@@ -1,6 +1,7 @@
 import logging
 import os
 import re
+import sys
 import tempfile
 from io import BytesIO
 from pathlib import Path
@@ -117,9 +118,15 @@ class _NativeWhisperModel:
         try:
             import whisper  # type: ignore
         except ImportError:
-            raise ImportError(
-                "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
-            )
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
+                )
+            else:
+                raise ImportError(
+                    "whisper is not installed. Unfortunately its dependencies are not yet available for Python 3.14."
+                )

         self.asr_options = asr_options
         self.max_tokens = asr_options.max_new_tokens
         self.temperature = asr_options.temperature
```
In pyproject.toml, the 3.14 classifier is added, the lxml bound is relaxed, and environment markers keep uninstallable extras off Python 3.14:

```diff
@@ -30,6 +30,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
 ]
 readme = "README.md"
 authors = [
@@ -63,7 +64,7 @@ dependencies = [
     'pandas (>=2.1.4,<3.0.0)',
     'marko (>=2.1.2,<3.0.0)',
     'openpyxl (>=3.1.5,<4.0.0)',
-    'lxml (>=4.0.0,<6.0.0)',
+    'lxml (>=4.0.0,<7.0.0)',
     'pillow (>=10.0.0,<12.0.0)',
     'tqdm (>=4.65.0,<5.0.0)',
     'pluggy (>=1.0.0,<2.0.0)',
@@ -95,19 +96,19 @@ ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
 vlm = [
     'transformers (>=4.46.0,<5.0.0)',
     'accelerate (>=1.2.1,<2.0.0)',
-    'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
-    'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64"',
+    'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64"',
+    'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and python_version < "3.14" and sys_platform == "linux" and platform_machine == "x86_64"',
     "qwen-vl-utils>=0.0.11",
 ]
 rapidocr = [
-    'rapidocr (>=3.3,<4.0.0) ; python_version < "3.14"',
-    'onnxruntime (>=1.7.0,<2.0.0)',
+    'rapidocr (>=3.3,<4.0.0)',
+    'onnxruntime (>=1.7.0,<2.0.0) ; python_version < "3.14"',
     # 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
     # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]
 asr = [
-    'mlx-whisper>=0.4.3 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
-    "openai-whisper>=20250625",
+    'mlx-whisper>=0.4.3 ; python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64"',
+    'openai-whisper>=20250625 ; python_version < "3.14"',
 ]

 [dependency-groups]
@@ -146,10 +147,10 @@ examples = [
     "langchain-milvus~=0.1",
     "langchain-text-splitters~=0.2",
     "modelscope>=1.29.0",
-    "gliner>=0.2.21",
+    'gliner>=0.2.21 ; python_version < "3.14"',  # gliner depends on onnxruntime which is not available on py3.14
 ]
 constraints = [
-    'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
+    'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10" and python_version < "3.14"',
     'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]
```
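These are PEP 508 environment markers: the installer evaluates them against the target interpreter and simply skips a requirement whose marker is false. A quick way to sanity-check a marker with the `packaging` library (illustrative, not part of the commit):

```python
from packaging.markers import Marker

marker = Marker(
    'python_version >= "3.10" and python_version < "3.14" '
    'and sys_platform == "darwin" and platform_machine == "arm64"'
)

# Evaluate against explicit environments rather than the running interpreter.
apple_silicon = {"sys_platform": "darwin", "platform_machine": "arm64"}
print(marker.evaluate({**apple_silicon, "python_version": "3.13"}))  # True: mlx-vlm installed
print(marker.evaluate({**apple_silicon, "python_version": "3.14"}))  # False: mlx-vlm skipped
```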
In a Farsi (RTL) groundtruth Markdown file, checkbox markers are updated after the docling-core changes (the lines read "Is the standard mandatory?", "yes"/"no", "standard issuing authority", "Has the product manufacturer obtained said standard?", and the heading "3. Admission to the stock exchange"):

```diff
@@ -16,9 +16,9 @@
 استاندارد اجباری است؟

-بلی
+- [ ] بلی

-خير
+- [x] خير

 مرجع صادرکننده استاندارد
@@ -26,7 +26,7 @@
 آيا توليدکننده محصول، استاندارد مذکور را اخذ نموده است؟

-بلی خير
+- [x] بلی خير

 ## -3 پذيرش در بورس
```
The ASR pipeline test module (presumably tests/test_asr_pipeline.py, cf. the PYTEST_TO_SKIP list above) is skipped wholesale on 3.14:

```diff
@@ -1,3 +1,4 @@
+import sys
 from pathlib import Path
 from unittest.mock import Mock, patch
@@ -10,6 +11,11 @@ from docling.datamodel.pipeline_options import AsrPipelineOptions
 from docling.document_converter import AudioFormatOption, DocumentConverter
 from docling.pipeline.asr_pipeline import AsrPipeline

+pytestmark = pytest.mark.skipif(
+    sys.version_info >= (3, 14),
+    reason="Python 3.14 is not yet supported by whisper dependencies.",
+)
+

 @pytest.fixture
 def test_audio_path():
```
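`pytestmark` at module level skips every test in the file at once. Where only some cases are affected, the same marker can instead be attached per parametrized case; a hypothetical variant (test and file names invented):

```python
import sys

import pytest

requires_whisper = pytest.mark.skipif(
    sys.version_info >= (3, 14),
    reason="Python 3.14 is not yet supported by whisper dependencies.",
)


@pytest.mark.parametrize(
    "audio_file",
    [
        pytest.param("sample.wav", marks=requires_whisper),
        "metadata_only.json",  # would still run on 3.14
    ],
)
def test_pipeline(audio_file: str) -> None:
    ...
```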
And in the OCR end-to-end test, the blanket Python-version gate on RapidOCR is replaced by a loop over its backends, skipping only onnxruntime on 3.14:

```diff
@@ -70,13 +70,19 @@ def test_e2e_conversions():
         (EasyOcrOptions(force_full_page_ocr=True), False),
     ]

-    # rapidocr is only available for Python >=3.6,<3.13
-    if sys.version_info < (3, 13):
-        engines.append((RapidOcrOptions(), False))
-        engines.append((RapidOcrOptions(force_full_page_ocr=True), False))
+    for rapidocr_backend in ["onnxruntime", "torch"]:
+        if sys.version_info >= (3, 14) and rapidocr_backend == "onnxruntime":
+            # skip onnxruntime backend on Python 3.14
+            continue
+
+        engines.append((RapidOcrOptions(backend=rapidocr_backend), False))
+        engines.append(
+            (RapidOcrOptions(backend=rapidocr_backend, force_full_page_ocr=True), False)
+        )
+        engines.append(
+            (
+                RapidOcrOptions(
+                    backend=rapidocr_backend,
+                    force_full_page_ocr=True,
+                    rec_font_path="test",
+                    rapidocr_params={"Rec.font_path": None},  # overwrites rec_font_path
```