Mirror of https://github.com/DS4SD/docling.git (synced 2025-12-08 12:48:28 +00:00)
feat: Support for Python 3.14 (#2530)
* fix dependencies for py314
* add metadata and CI tests
* add back gliner
* update error message about Python 3.14 availability
* skip tests which cannot run on py 3.14
* fix lint
* remove vllm from py 3.14 deps
* safe import for vllm
* update lock
* remove torch.compile()
* update checkbox results after docling-core changes
* cannot run mlx example in CI
* add test for rapidocr backends and skip onnxruntime on py3.14
* fix other occurrences of torch.compile()
* allow torch.compile for Python <3.14; proper support will be introduced with new torch releases

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
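The whole change follows one recurring pattern: optional accelerators are still tried at runtime, but the behavior on failure is now gated on `sys.version_info`, so users on Python 3.14 get an honest 'not yet available' message instead of an install hint that cannot work. A minimal sketch of the pattern (the `load_accelerator` helper is hypothetical, not code from this commit):

```python
import sys


def load_accelerator():
    # Hypothetical helper illustrating the gating pattern used throughout
    # this commit: try the optional dependency, and tailor the failure
    # message to whether the interpreter could support it at all.
    try:
        import mlx_vlm  # noqa: F401  # optional, Apple Silicon only

        return "mlx"
    except ImportError:
        if sys.version_info < (3, 14):
            print("To run models faster, install mlx-vlm: pip install mlx-vlm")
        else:
            print("mlx-vlm is not yet available on Python 3.14.")
        return None
```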
.github/workflows/checks.yml (vendored, 8 lines changed)
```diff
@@ -20,7 +20,7 @@ env:
     tests/test_asr_pipeline.py
     tests/test_threaded_pipeline.py
   PYTEST_TO_SKIP: |-
-  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping)\.py$'
+  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example)\.py$'
 
 jobs:
   lint:
@@ -62,7 +62,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
     steps:
       - uses: actions/checkout@v5
 
@@ -129,7 +129,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
     steps:
       - uses: actions/checkout@v5
 
@@ -201,7 +201,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
     steps:
       - uses: actions/checkout@v5
 
```
```diff
@@ -738,10 +738,15 @@ def convert( # noqa: C901
 
                     pipeline_options.vlm_options = SMOLDOCLING_MLX
                 except ImportError:
-                    _log.warning(
-                        "To run SmolDocling faster, please install mlx-vlm:\n"
-                        "pip install mlx-vlm"
-                    )
+                    if sys.version_info < (3, 14):
+                        _log.warning(
+                            "To run SmolDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+                    else:
+                        _log.warning(
+                            "You can run SmolDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                        )
 
         elif vlm_model == VlmModelType.GRANITEDOCLING:
             pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
@@ -751,10 +756,16 @@ def convert( # noqa: C901
 
                     pipeline_options.vlm_options = GRANITEDOCLING_MLX
                 except ImportError:
-                    _log.warning(
-                        "To run GraniteDocling faster, please install mlx-vlm:\n"
-                        "pip install mlx-vlm"
-                    )
+                    if sys.version_info < (3, 14):
+                        _log.warning(
+                            "To run GraniteDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+                    else:
+                        _log.warning(
+                            "You can run GraniteDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                        )
 
         elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
             pipeline_options.vlm_options = SMOLDOCLING_VLLM
+
```
```diff
@@ -1,3 +1,4 @@
+import sys
 import threading
 from collections.abc import Iterable
 from pathlib import Path
@@ -75,7 +76,10 @@ class PictureDescriptionVlmModel(
                 else "sdpa"
             ),
         )
-        self.model = torch.compile(self.model)  # type: ignore
+        if sys.version_info < (3, 14):
+            self.model = torch.compile(self.model)  # type: ignore
+        else:
+            self.model.eval()
 
         self.provenance = f"{self.options.repo_id}"
 
```
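The same `torch.compile()` guard recurs in the model files below; per the commit message, compilation is simply skipped on 3.14 until new torch releases add proper support. As a sketch, the guard could be factored into a small helper (`maybe_compile` is an assumption for illustration, not part of the commit):

```python
import sys

import torch


def maybe_compile(model: torch.nn.Module) -> torch.nn.Module:
    # torch.compile() does not support Python 3.14 yet; fall back to plain
    # eval mode so inference still works, just without compilation.
    if sys.version_info < (3, 14):
        return torch.compile(model)  # type: ignore[return-value]
    model.eval()
    return model
```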
```diff
@@ -1,5 +1,6 @@
 import importlib.metadata
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -129,7 +130,10 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             trust_remote_code=vlm_options.trust_remote_code,
             revision=vlm_options.revision,
         )
-        self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        if sys.version_info < (3, 14):
+            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        else:
+            self.vlm_model.eval()
 
         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(
```
```diff
@@ -50,9 +50,14 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
             from mlx_vlm.utils import load_config  # type: ignore
         except ImportError:
-            raise ImportError(
-                "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
-            )
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
+            else:
+                raise ImportError(
+                    "mlx-vlm is not installed. It is not yet available on Python 3.14."
+                )
 
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
```
```diff
@@ -1,4 +1,5 @@
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -153,7 +154,10 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
             ),
             trust_remote_code=vlm_options.trust_remote_code,
         )
-        self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        if sys.version_info < (3, 14):
+            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        else:
+            self.vlm_model.eval()
 
         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
```
```diff
@@ -1,4 +1,5 @@
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -100,7 +101,18 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             return
 
         from transformers import AutoProcessor
-        from vllm import LLM, SamplingParams
+
+        try:
+            from vllm import LLM, SamplingParams
+        except ImportError:
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "vllm is not installed. Please install it via `pip install vllm`."
+                )
+            else:
+                raise ImportError(
+                    "vllm is not installed. It is not yet available on Python 3.14."
+                )
 
         # Device selection
         self.device = decide_device(
```
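The 'safe import for vllm' bullet amounts to deferring the import to model construction and wrapping it in try/except, mirroring the mlx-vlm and whisper handling. A generic sketch of the idea (`import_or_explain` is a hypothetical helper, not docling API):

```python
import importlib
import sys


def import_or_explain(module: str, install_hint: str):
    # Resolve an optional dependency lazily, distinguishing "not installed"
    # from "cannot be installed on this Python version".
    try:
        return importlib.import_module(module)
    except ImportError:
        if sys.version_info < (3, 14):
            raise ImportError(f"{module} is not installed. {install_hint}")
        raise ImportError(f"{module} is not yet available on Python 3.14.")


# Example use at a vllm-style call site:
# vllm = import_or_explain("vllm", "Please install it via `pip install vllm`.")
```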
```diff
@@ -1,6 +1,7 @@
 import logging
 import os
 import re
+import sys
 import tempfile
 from io import BytesIO
 from pathlib import Path
@@ -117,9 +118,15 @@ class _NativeWhisperModel:
         try:
             import whisper  # type: ignore
         except ImportError:
-            raise ImportError(
-                "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
-            )
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
+                )
+            else:
+                raise ImportError(
+                    "whisper is not installed. Unfortunately its dependencies are not yet available for Python 3.14."
+                )
 
         self.asr_options = asr_options
         self.max_tokens = asr_options.max_new_tokens
         self.temperature = asr_options.temperature
```
```diff
@@ -30,6 +30,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
 ]
 readme = "README.md"
 authors = [
@@ -63,7 +64,7 @@ dependencies = [
     'pandas (>=2.1.4,<3.0.0)',
     'marko (>=2.1.2,<3.0.0)',
     'openpyxl (>=3.1.5,<4.0.0)',
-    'lxml (>=4.0.0,<6.0.0)',
+    'lxml (>=4.0.0,<7.0.0)',
     'pillow (>=10.0.0,<12.0.0)',
     'tqdm (>=4.65.0,<5.0.0)',
     'pluggy (>=1.0.0,<2.0.0)',
@@ -95,19 +96,19 @@ ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
 vlm = [
     'transformers (>=4.46.0,<5.0.0)',
     'accelerate (>=1.2.1,<2.0.0)',
-    'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
-    'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64"',
+    'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64"',
+    'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and python_version < "3.14" and sys_platform == "linux" and platform_machine == "x86_64"',
     "qwen-vl-utils>=0.0.11",
 ]
 rapidocr = [
-    'rapidocr (>=3.3,<4.0.0) ; python_version < "3.14"',
-    'onnxruntime (>=1.7.0,<2.0.0)',
+    'rapidocr (>=3.3,<4.0.0)',
+    'onnxruntime (>=1.7.0,<2.0.0) ; python_version < "3.14"',
     # 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
     # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]
 asr = [
-    'mlx-whisper>=0.4.3 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
-    "openai-whisper>=20250625",
+    'mlx-whisper>=0.4.3 ; python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64"',
+    'openai-whisper>=20250625 ; python_version < "3.14"',
 ]
 
 [dependency-groups]
@@ -146,10 +147,10 @@ examples = [
     "langchain-milvus~=0.1",
     "langchain-text-splitters~=0.2",
     "modelscope>=1.29.0",
-    "gliner>=0.2.21",
+    'gliner>=0.2.21 ; python_version < "3.14"',  # gliner depends on onnxruntime which is not available on py3.14
 ]
 constraints = [
-    'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
+    'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10" and python_version < "3.14"',
     'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]
 
```
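The dependency changes lean on PEP 508 environment markers, which installers evaluate against the running interpreter and platform. One can preview how a marker resolves with the `packaging` library (an illustrative snippet, not part of the commit):

```python
from packaging.markers import Marker

# The marker this commit attaches to the vllm extra.
vllm_marker = Marker(
    'python_version >= "3.10" and python_version < "3.14" '
    'and sys_platform == "linux" and platform_machine == "x86_64"'
)

# False on Python 3.14, on macOS/Windows, and on non-x86_64 hosts.
print(vllm_marker.evaluate())
```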
```diff
@@ -16,9 +16,9 @@
 
 استاندارد اجباری است؟
 
-بلی
+- [ ] بلی
 
-خير
+- [x] خير
 
 مرجع صادرکننده استاندارد
 
@@ -26,7 +26,7 @@
 
 آيا توليدکننده محصول، استاندارد مذکور را اخذ نموده است؟
 
-بلی خير
+- [x] بلی خير
 
 ## -3 پذيرش در بورس
 
```
```diff
@@ -1,3 +1,4 @@
+import sys
 from pathlib import Path
 from unittest.mock import Mock, patch
 
@@ -10,6 +11,11 @@ from docling.datamodel.pipeline_options import AsrPipelineOptions
 from docling.document_converter import AudioFormatOption, DocumentConverter
 from docling.pipeline.asr_pipeline import AsrPipeline
 
+pytestmark = pytest.mark.skipif(
+    sys.version_info >= (3, 14),
+    reason="Python 3.14 is not yet supported by whisper dependencies.",
+)
+
 
 @pytest.fixture
 def test_audio_path():
```
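Declared once at module level, `pytestmark` makes pytest skip every test in the file at collection time on 3.14. The same guard can be narrowed to individual tests with a decorator, as in this sketch (the test name is hypothetical):

```python
import sys

import pytest


@pytest.mark.skipif(
    sys.version_info >= (3, 14),
    reason="Python 3.14 is not yet supported by whisper dependencies.",
)
def test_whisper_import():
    # Skipped on 3.14; on older interpreters this verifies whisper imports.
    import whisper  # noqa: F401
```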
```diff
@@ -70,13 +70,19 @@ def test_e2e_conversions():
         (EasyOcrOptions(force_full_page_ocr=True), False),
     ]
 
-    # rapidocr is only available for Python >=3.6,<3.13
-    if sys.version_info < (3, 13):
-        engines.append((RapidOcrOptions(), False))
-        engines.append((RapidOcrOptions(force_full_page_ocr=True), False))
+    for rapidocr_backend in ["onnxruntime", "torch"]:
+        if sys.version_info >= (3, 14) and rapidocr_backend == "onnxruntime":
+            # skip onnxruntime backend on Python 3.14
+            continue
+
+        engines.append((RapidOcrOptions(backend=rapidocr_backend), False))
+        engines.append(
+            (RapidOcrOptions(backend=rapidocr_backend, force_full_page_ocr=True), False)
+        )
         engines.append(
             (
                 RapidOcrOptions(
+                    backend=rapidocr_backend,
                     force_full_page_ocr=True,
                     rec_font_path="test",
                     rapidocr_params={"Rec.font_path": None},  # overwrites rec_font_path
```