mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
refactor instances of VLM models
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
fb0d979419
commit
0b2c1d5eda
@ -29,13 +29,6 @@ from docling.datamodel.base_models import (
|
|||||||
OutputFormat,
|
OutputFormat,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_model_specializations import (
|
|
||||||
VlmModelType,
|
|
||||||
granite_vision_vlm_conversion_options,
|
|
||||||
granite_vision_vlm_ollama_conversion_options,
|
|
||||||
smoldocling_vlm_conversion_options,
|
|
||||||
smoldocling_vlm_mlx_conversion_options,
|
|
||||||
)
|
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
@ -48,6 +41,13 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_vlm_model_spec import (
|
||||||
|
GRANITE_VISION_OLLAMA,
|
||||||
|
GRANITE_VISION_TRANSFORMERS,
|
||||||
|
SMOLDOCLING_MLX,
|
||||||
|
SMOLDOCLING_TRANSFORMERS,
|
||||||
|
VlmModelType,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
from docling.models.factories import get_ocr_factory
|
from docling.models.factories import get_ocr_factory
|
||||||
@ -549,20 +549,16 @@ def convert( # noqa: C901
|
|||||||
)
|
)
|
||||||
|
|
||||||
if vlm_model == VlmModelType.GRANITE_VISION:
|
if vlm_model == VlmModelType.GRANITE_VISION:
|
||||||
pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
|
||||||
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
||||||
pipeline_options.vlm_options = (
|
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
|
||||||
granite_vision_vlm_ollama_conversion_options
|
|
||||||
)
|
|
||||||
elif vlm_model == VlmModelType.SMOLDOCLING:
|
elif vlm_model == VlmModelType.SMOLDOCLING:
|
||||||
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
|
||||||
if sys.platform == "darwin":
|
if sys.platform == "darwin":
|
||||||
try:
|
try:
|
||||||
import mlx_vlm
|
import mlx_vlm
|
||||||
|
|
||||||
pipeline_options.vlm_options = (
|
pipeline_options.vlm_options = SMOLDOCLING_MLX
|
||||||
smoldocling_vlm_mlx_conversion_options
|
|
||||||
)
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
_log.warning(
|
_log.warning(
|
||||||
"To run SmolDocling faster, please install mlx-vlm:\n"
|
"To run SmolDocling faster, please install mlx-vlm:\n"
|
||||||
|
@ -16,10 +16,13 @@ from pydantic import (
|
|||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from docling.datamodel.pipeline_model_specializations import (
|
# Import the following for backwards compatibility
|
||||||
ApiVlmOptions,
|
from docling.datamodel.pipeline_vlm_model_spec import (
|
||||||
HuggingFaceVlmOptions,
|
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
||||||
smoldocling_vlm_conversion_options,
|
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
||||||
|
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
|
||||||
|
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
|
||||||
|
VlmModelType,
|
||||||
)
|
)
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -302,6 +305,65 @@ class PaginatedPipelineOptions(PipelineOptions):
|
|||||||
generate_picture_images: bool = False
|
generate_picture_images: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class BaseVlmOptions(BaseModel):
|
||||||
|
kind: str
|
||||||
|
prompt: str
|
||||||
|
|
||||||
|
|
||||||
|
class ResponseFormat(str, Enum):
|
||||||
|
DOCTAGS = "doctags"
|
||||||
|
MARKDOWN = "markdown"
|
||||||
|
HTML = "html"
|
||||||
|
|
||||||
|
|
||||||
|
class InferenceFramework(str, Enum):
|
||||||
|
MLX = "mlx"
|
||||||
|
TRANSFORMERS = "transformers"
|
||||||
|
TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
|
||||||
|
TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
|
||||||
|
TRANSFORMERS_LlavaForConditionalGeneration = (
|
||||||
|
"transformers-LlavaForConditionalGeneration"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class HuggingFaceVlmOptions(BaseVlmOptions):
|
||||||
|
kind: Literal["hf_model_options"] = "hf_model_options"
|
||||||
|
|
||||||
|
repo_id: str
|
||||||
|
load_in_8bit: bool = True
|
||||||
|
llm_int8_threshold: float = 6.0
|
||||||
|
quantized: bool = False
|
||||||
|
|
||||||
|
inference_framework: InferenceFramework
|
||||||
|
response_format: ResponseFormat
|
||||||
|
|
||||||
|
scale: float = 2.0
|
||||||
|
|
||||||
|
temperature: float = 0.0
|
||||||
|
stop_strings: list[str] = []
|
||||||
|
|
||||||
|
use_kv_cache: bool = True
|
||||||
|
max_new_tokens: int = 4096
|
||||||
|
|
||||||
|
@property
|
||||||
|
def repo_cache_folder(self) -> str:
|
||||||
|
return self.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
|
class ApiVlmOptions(BaseVlmOptions):
|
||||||
|
kind: Literal["api_model_options"] = "api_model_options"
|
||||||
|
|
||||||
|
url: AnyUrl = AnyUrl(
|
||||||
|
"http://localhost:11434/v1/chat/completions"
|
||||||
|
) # Default to ollama
|
||||||
|
headers: Dict[str, str] = {}
|
||||||
|
params: Dict[str, Any] = {}
|
||||||
|
scale: float = 2.0
|
||||||
|
timeout: float = 60
|
||||||
|
concurrency: int = 1
|
||||||
|
response_format: ResponseFormat
|
||||||
|
|
||||||
|
|
||||||
class VlmPipelineOptions(PaginatedPipelineOptions):
|
class VlmPipelineOptions(PaginatedPipelineOptions):
|
||||||
generate_page_images: bool = True
|
generate_page_images: bool = True
|
||||||
force_backend_text: bool = (
|
force_backend_text: bool = (
|
||||||
|
@ -1,83 +1,22 @@
|
|||||||
import logging
|
import logging
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
|
||||||
|
|
||||||
from pydantic import (
|
from pydantic import (
|
||||||
AnyUrl,
|
AnyUrl,
|
||||||
BaseModel,
|
)
|
||||||
|
|
||||||
|
from docling.datamodel.pipeline_options import (
|
||||||
|
ApiVlmOptions,
|
||||||
|
HuggingFaceVlmOptions,
|
||||||
|
InferenceFramework,
|
||||||
|
ResponseFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseVlmOptions(BaseModel):
|
|
||||||
kind: str
|
|
||||||
prompt: str
|
|
||||||
|
|
||||||
|
|
||||||
class ResponseFormat(str, Enum):
|
|
||||||
DOCTAGS = "doctags"
|
|
||||||
MARKDOWN = "markdown"
|
|
||||||
HTML = "html"
|
|
||||||
|
|
||||||
|
|
||||||
class InferenceFramework(str, Enum):
|
|
||||||
MLX = "mlx"
|
|
||||||
TRANSFORMERS = "transformers"
|
|
||||||
TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
|
|
||||||
TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
|
|
||||||
TRANSFORMERS_LlavaForConditionalGeneration = (
|
|
||||||
"transformers-LlavaForConditionalGeneration"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class HuggingFaceVlmOptions(BaseVlmOptions):
|
|
||||||
kind: Literal["hf_model_options"] = "hf_model_options"
|
|
||||||
|
|
||||||
repo_id: str
|
|
||||||
load_in_8bit: bool = True
|
|
||||||
llm_int8_threshold: float = 6.0
|
|
||||||
quantized: bool = False
|
|
||||||
|
|
||||||
inference_framework: InferenceFramework
|
|
||||||
response_format: ResponseFormat
|
|
||||||
|
|
||||||
scale: float = 2.0
|
|
||||||
|
|
||||||
temperature: float = 0.0
|
|
||||||
stop_strings: list[str] = []
|
|
||||||
|
|
||||||
use_kv_cache: bool = True
|
|
||||||
max_new_tokens: int = 4096
|
|
||||||
|
|
||||||
@property
|
|
||||||
def repo_cache_folder(self) -> str:
|
|
||||||
return self.repo_id.replace("/", "--")
|
|
||||||
|
|
||||||
|
|
||||||
class ApiVlmOptions(BaseVlmOptions):
|
|
||||||
kind: Literal["api_model_options"] = "api_model_options"
|
|
||||||
|
|
||||||
url: AnyUrl = AnyUrl(
|
|
||||||
"http://localhost:11434/v1/chat/completions"
|
|
||||||
) # Default to ollama
|
|
||||||
headers: Dict[str, str] = {}
|
|
||||||
params: Dict[str, Any] = {}
|
|
||||||
scale: float = 2.0
|
|
||||||
timeout: float = 60
|
|
||||||
concurrency: int = 1
|
|
||||||
response_format: ResponseFormat
|
|
||||||
|
|
||||||
|
|
||||||
class VlmModelType(str, Enum):
|
|
||||||
SMOLDOCLING = "smoldocling"
|
|
||||||
GRANITE_VISION = "granite_vision"
|
|
||||||
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
|
|
||||||
|
|
||||||
|
|
||||||
# SmolDocling
|
# SmolDocling
|
||||||
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
SMOLDOCLING_MLX = HuggingFaceVlmOptions(
|
||||||
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
||||||
prompt="Convert this page to docling.",
|
prompt="Convert this page to docling.",
|
||||||
response_format=ResponseFormat.DOCTAGS,
|
response_format=ResponseFormat.DOCTAGS,
|
||||||
@ -86,7 +25,7 @@ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
|||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
|
SMOLDOCLING_TRANSFORMERS = HuggingFaceVlmOptions(
|
||||||
repo_id="ds4sd/SmolDocling-256M-preview",
|
repo_id="ds4sd/SmolDocling-256M-preview",
|
||||||
prompt="Convert this page to docling.",
|
prompt="Convert this page to docling.",
|
||||||
response_format=ResponseFormat.DOCTAGS,
|
response_format=ResponseFormat.DOCTAGS,
|
||||||
@ -96,7 +35,7 @@ smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# GraniteVision
|
# GraniteVision
|
||||||
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
GRANITE_VISION_TRANSFORMERS = HuggingFaceVlmOptions(
|
||||||
repo_id="ibm-granite/granite-vision-3.2-2b",
|
repo_id="ibm-granite/granite-vision-3.2-2b",
|
||||||
prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
@ -105,7 +44,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
|||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
|
GRANITE_VISION_OLLAMA = ApiVlmOptions(
|
||||||
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
|
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
|
||||||
params={"model": "granite3.2-vision:2b"},
|
params={"model": "granite3.2-vision:2b"},
|
||||||
prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
|
||||||
@ -116,7 +55,7 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Pixtral
|
# Pixtral
|
||||||
pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
|
PIXTRAL_12B_TRANSFORMERS = HuggingFaceVlmOptions(
|
||||||
repo_id="mistral-community/pixtral-12b",
|
repo_id="mistral-community/pixtral-12b",
|
||||||
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
@ -125,7 +64,7 @@ pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
|
|||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
PIXTRAL_12B_MLX = HuggingFaceVlmOptions(
|
||||||
repo_id="mlx-community/pixtral-12b-bf16",
|
repo_id="mlx-community/pixtral-12b-bf16",
|
||||||
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
@ -135,7 +74,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Phi4
|
# Phi4
|
||||||
phi_vlm_conversion_options = HuggingFaceVlmOptions(
|
PHI4_TRANSFORMERS = HuggingFaceVlmOptions(
|
||||||
repo_id="microsoft/Phi-4-multimodal-instruct",
|
repo_id="microsoft/Phi-4-multimodal-instruct",
|
||||||
prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
|
prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
@ -145,7 +84,7 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Qwen
|
# Qwen
|
||||||
qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
QWEN25_VL_3B_MLX = HuggingFaceVlmOptions(
|
||||||
repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
|
repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
|
||||||
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
@ -155,7 +94,7 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Gemma-3
|
# Gemma-3
|
||||||
gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
|
GEMMA3_12B_MLX = HuggingFaceVlmOptions(
|
||||||
repo_id="mlx-community/gemma-3-12b-it-bf16",
|
repo_id="mlx-community/gemma-3-12b-it-bf16",
|
||||||
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
@ -164,7 +103,7 @@ gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
|
|||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
|
GEMMA3_27B_MLX = HuggingFaceVlmOptions(
|
||||||
repo_id="mlx-community/gemma-3-27b-it-bf16",
|
repo_id="mlx-community/gemma-3-27b-it-bf16",
|
||||||
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
@ -172,3 +111,9 @@ gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
|
|||||||
scale=2.0,
|
scale=2.0,
|
||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class VlmModelType(str, Enum):
|
||||||
|
SMOLDOCLING = "smoldocling"
|
||||||
|
GRANITE_VISION = "granite_vision"
|
||||||
|
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
|
@ -27,13 +27,11 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat, Page
|
from docling.datamodel.base_models import InputFormat, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_model_specializations import (
|
from docling.datamodel.pipeline_options import (
|
||||||
ApiVlmOptions,
|
ApiVlmOptions,
|
||||||
HuggingFaceVlmOptions,
|
HuggingFaceVlmOptions,
|
||||||
InferenceFramework,
|
InferenceFramework,
|
||||||
ResponseFormat,
|
ResponseFormat,
|
||||||
)
|
|
||||||
from docling.datamodel.pipeline_options import (
|
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
@ -2,14 +2,14 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from docling.datamodel.pipeline_model_specializations import (
|
|
||||||
smoldocling_vlm_conversion_options,
|
|
||||||
smoldocling_vlm_mlx_conversion_options,
|
|
||||||
)
|
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
granite_picture_description,
|
granite_picture_description,
|
||||||
smolvlm_picture_description,
|
smolvlm_picture_description,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_vlm_model_spec import (
|
||||||
|
SMOLDOCLING_MLX,
|
||||||
|
SMOLDOCLING_TRANSFORMERS,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.code_formula_model import CodeFormulaModel
|
from docling.models.code_formula_model import CodeFormulaModel
|
||||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
||||||
@ -87,8 +87,8 @@ def download_models(
|
|||||||
if with_smoldocling:
|
if with_smoldocling:
|
||||||
_log.info("Downloading SmolDocling model...")
|
_log.info("Downloading SmolDocling model...")
|
||||||
HuggingFaceVlmModel.download_models(
|
HuggingFaceVlmModel.download_models(
|
||||||
repo_id=smoldocling_vlm_conversion_options.repo_id,
|
repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
|
||||||
local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
|
local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
|
||||||
force=force,
|
force=force,
|
||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
@ -96,9 +96,8 @@ def download_models(
|
|||||||
if with_smoldocling_mlx:
|
if with_smoldocling_mlx:
|
||||||
_log.info("Downloading SmolDocling MLX model...")
|
_log.info("Downloading SmolDocling MLX model...")
|
||||||
HuggingFaceVlmModel.download_models(
|
HuggingFaceVlmModel.download_models(
|
||||||
repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
|
repo_id=SMOLDOCLING_MLX.repo_id,
|
||||||
local_dir=output_dir
|
local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
|
||||||
/ smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
|
|
||||||
force=force,
|
force=force,
|
||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
@ -13,20 +13,20 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
|||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_model_specializations import (
|
|
||||||
gemma_3_12b_mlx_conversion_options,
|
|
||||||
granite_vision_vlm_conversion_options,
|
|
||||||
granite_vision_vlm_ollama_conversion_options,
|
|
||||||
phi_vlm_conversion_options,
|
|
||||||
pixtral_12b_vlm_conversion_options,
|
|
||||||
pixtral_12b_vlm_mlx_conversion_options,
|
|
||||||
qwen25_vl_3b_vlm_mlx_conversion_options,
|
|
||||||
smoldocling_vlm_conversion_options,
|
|
||||||
smoldocling_vlm_mlx_conversion_options,
|
|
||||||
)
|
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_vlm_model_spec import (
|
||||||
|
GEMMA3_12B_MLX,
|
||||||
|
GRANITE_VISION_OLLAMA,
|
||||||
|
GRANITE_VISION_TRANSFORMERS,
|
||||||
|
PHI4_TRANSFORMERS,
|
||||||
|
PIXTRAL_12B_MLX,
|
||||||
|
PIXTRAL_12B_TRANSFORMERS,
|
||||||
|
QWEN25_VL_3B_MLX,
|
||||||
|
SMOLDOCLING_MLX,
|
||||||
|
SMOLDOCLING_TRANSFORMERS,
|
||||||
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
@ -120,16 +120,16 @@ if __name__ == "__main__":
|
|||||||
rows = []
|
rows = []
|
||||||
for vlm_options in [
|
for vlm_options in [
|
||||||
## DocTags / SmolDocling models
|
## DocTags / SmolDocling models
|
||||||
smoldocling_vlm_conversion_options,
|
SMOLDOCLING_TRANSFORMERS,
|
||||||
# smoldocling_vlm_mlx_conversion_options,
|
SMOLDOCLING_MLX,
|
||||||
## Markdown models (using MLX framework)
|
## Markdown models (using MLX framework)
|
||||||
# qwen25_vl_3b_vlm_mlx_conversion_options,
|
QWEN25_VL_3B_MLX,
|
||||||
# pixtral_12b_vlm_mlx_conversion_options,
|
PIXTRAL_12B_MLX,
|
||||||
# gemma_3_12b_mlx_conversion_options,
|
GEMMA3_12B_MLX,
|
||||||
## Markdown models (using Transformers framework)
|
## Markdown models (using Transformers framework)
|
||||||
# granite_vision_vlm_conversion_options,
|
GRANITE_VISION_TRANSFORMERS,
|
||||||
phi_vlm_conversion_options,
|
PHI4_TRANSFORMERS,
|
||||||
pixtral_12b_vlm_conversion_options,
|
PIXTRAL_12B_TRANSFORMERS,
|
||||||
]:
|
]:
|
||||||
pipeline_options.vlm_options = vlm_options
|
pipeline_options.vlm_options = vlm_options
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user