add supported_devices

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-06-01 21:12:43 +02:00
parent 5d21153948
commit 7f6df727e3
26 changed files with 81 additions and 106 deletions

View File

@ -22,6 +22,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
@ -30,8 +31,6 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrOptions,
PaginatedPipelineOptions,

View File

@ -1,6 +1,4 @@
import logging
import os
import re
from enum import Enum
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@ -10,13 +8,11 @@ from pydantic import (
BaseModel,
ConfigDict,
Field,
field_validator,
model_validator,
)
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing_extensions import deprecated
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
@ -34,64 +30,6 @@ from docling.datamodel.vlm_model_spec import (
_log = logging.getLogger(__name__)
class AcceleratorDevice(str, Enum):
"""Devices to run model inference"""
AUTO = "auto"
CPU = "cpu"
CUDA = "cuda"
MPS = "mps"
class AcceleratorOptions(BaseSettings):
model_config = SettingsConfigDict(
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
)
num_threads: int = 4
device: Union[str, AcceleratorDevice] = "auto"
cuda_use_flash_attention2: bool = False
@field_validator("device")
def validate_device(cls, value):
# "auto", "cpu", "cuda", "mps", or "cuda:N"
if value in {d.value for d in AcceleratorDevice} or re.match(
r"^cuda(:\d+)?$", value
):
return value
raise ValueError(
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
)
@model_validator(mode="before")
@classmethod
def check_alternative_envvars(cls, data: Any) -> Any:
r"""
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
The alternative envvar is used only if it is valid and the regular envvar is not set.
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
the same functionality. In case the alias envvar is set and the user tries to override the
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
as an extra input instead of simply overwriting the evvar value for that parameter.
"""
if isinstance(data, dict):
input_num_threads = data.get("num_threads")
# Check if to set the num_threads from the alternative envvar
if input_num_threads is None:
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
omp_num_threads = os.getenv("OMP_NUM_THREADS")
if docling_num_threads is None and omp_num_threads is not None:
try:
data["num_threads"] = int(omp_num_threads)
except ValueError:
_log.error(
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
omp_num_threads,
)
return data
class BaseOptions(BaseModel):
"""Base class for options."""

View File

@ -1,9 +1,11 @@
from enum import Enum
from typing import Any, Dict, Literal
from typing import Any, Dict, List, Literal
from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
class BaseVlmOptions(BaseModel):
kind: str
@ -35,6 +37,12 @@ class InlineVlmOptions(BaseVlmOptions):
inference_framework: InferenceFramework
response_format: ResponseFormat
supported_devices: List[AcceleratorDevice] = [
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
]
scale: float = 2.0
temperature: float = 0.0

View File

@ -5,6 +5,7 @@ from pydantic import (
AnyUrl,
)
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
@ -21,6 +22,7 @@ SMOLDOCLING_MLX = InlineVlmOptions(
prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
@ -30,6 +32,11 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
supported_devices=[
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
],
scale=2.0,
temperature=0.0,
)
@ -40,6 +47,7 @@ GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
scale=2.0,
temperature=0.0,
)
@ -60,6 +68,7 @@ PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS_VISION2SEQ,
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
scale=2.0,
temperature=0.0,
)
@ -69,6 +78,7 @@ PIXTRAL_12B_MLX = InlineVlmOptions(
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
@ -77,8 +87,10 @@ PIXTRAL_12B_MLX = InlineVlmOptions(
PHI4_TRANSFORMERS = InlineVlmOptions(
repo_id="microsoft/Phi-4-multimodal-instruct",
prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
trust_remote_code=True,
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS_CAUSALLM,
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
scale=2.0,
temperature=0.0,
)
@ -89,6 +101,7 @@ QWEN25_VL_3B_MLX = InlineVlmOptions(
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
@ -99,6 +112,7 @@ GEMMA3_12B_MLX = InlineVlmOptions(
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)
@ -108,6 +122,7 @@ GEMMA3_27B_MLX = InlineVlmOptions(
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
supported_devices=[AcceleratorDevice.MPS],
scale=2.0,
temperature=0.0,
)

View File

@ -11,9 +11,10 @@ from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BaseModelWithOptions, BasePageModel

View File

@ -16,8 +16,8 @@ from docling_core.types.doc.labels import CodeLanguageLabel
from PIL import Image, ImageOps
from pydantic import BaseModel
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.models.base_model import BaseItemAndImageEnrichmentModel
from docling.utils.accelerator_utils import decide_device

View File

@ -13,7 +13,7 @@ from docling_core.types.doc import (
from PIL import Image
from pydantic import BaseModel
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.models.base_model import BaseEnrichmentModel
from docling.utils.accelerator_utils import decide_device

View File

@ -9,11 +9,10 @@ import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrOptions,
)

View File

@ -10,9 +10,9 @@ from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device

View File

@ -8,10 +8,10 @@ from typing import Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrMacOptions,
OcrOptions,
)

View File

@ -5,8 +5,8 @@ from typing import Optional, Type, Union
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionApiOptions,
PictureDescriptionBaseOptions,
)

View File

@ -13,8 +13,8 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
)
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionBaseOptions,
)
from docling.models.base_model import (

View File

@ -4,8 +4,8 @@ from typing import Optional, Type, Union
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionBaseOptions,
PictureDescriptionVlmOptions,
)

View File

@ -7,11 +7,10 @@ import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
OcrOptions,
RapidOcrOptions,
)

View File

@ -13,11 +13,10 @@ from docling_core.types.doc.page import (
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
)

View File

@ -13,10 +13,10 @@ import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrOptions,
TesseractCliOcrOptions,
)

View File

@ -7,10 +7,10 @@ from typing import Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrOptions,
TesseractOcrOptions,
)

View File

@ -4,11 +4,11 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
@ -39,8 +39,10 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
GenerationConfig,
)
self.device = decide_device(accelerator_options.device)
self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
self.device = decide_device(
accelerator_options.device,
supported_devices=vlm_options.supported_devices,
)
_log.debug(f"Available device for VLM: {self.device}")
self.use_cache = vlm_options.use_kv_cache

View File

@ -4,11 +4,11 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
@ -38,9 +38,11 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
BitsAndBytesConfig,
)
self.device = decide_device(accelerator_options.device)
self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
_log.debug(f"Available device for HuggingFace VLM: {self.device}")
self.device = decide_device(
accelerator_options.device,
supported_devices=vlm_options.supported_devices,
)
_log.debug(f"Available device for VLM: {self.device}")
self.use_cache = vlm_options.use_kv_cache
self.max_new_tokens = vlm_options.max_new_tokens

View File

@ -4,11 +4,11 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel

View File

@ -1,13 +1,16 @@
import logging
from typing import List, Optional
import torch
from docling.datamodel.pipeline_options import AcceleratorDevice
from docling.datamodel.accelerator_options import AcceleratorDevice
_log = logging.getLogger(__name__)
def decide_device(accelerator_device: str) -> str:
def decide_device(
accelerator_device: str, supported_devices: Optional[List[AcceleratorDevice]] = None
) -> str:
r"""
Resolve the device based on the acceleration options and the available devices in the system.
@ -20,6 +23,18 @@ def decide_device(accelerator_device: str) -> str:
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
if supported_devices is not None:
if has_cuda and AcceleratorDevice.CUDA not in supported_devices:
_log.info(
f"Removing CUDA from available devices because it is not in {supported_devices=}"
)
has_cuda = False
if has_mps and AcceleratorDevice.MPS not in supported_devices:
_log.info(
f"Removing MPS from available devices because it is not in {supported_devices=}"
)
has_mps = False
if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
if has_cuda:
device = "cuda:0"

View File

@ -3,10 +3,9 @@ import logging
import time
from pathlib import Path
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -1,9 +1,8 @@
from pathlib import Path
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
)
from docling.datamodel.settings import settings

View File

@ -1,9 +1,10 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .test_data_gen_flag import GEN_TEST_DATA

View File

@ -3,10 +3,10 @@ from pathlib import Path
from typing import List, Tuple
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
EasyOcrOptions,
OcrMacOptions,
OcrOptions,

View File

@ -7,11 +7,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TableFormerMode,
)