diff --git a/docling/cli/main.py b/docling/cli/main.py
index cd2f040b..88edde0b 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -29,13 +29,6 @@ from docling.datamodel.base_models import (
     OutputFormat,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_model_specializations import (
-    VlmModelType,
-    granite_vision_vlm_conversion_options,
-    granite_vision_vlm_ollama_conversion_options,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
-)
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
     AcceleratorOptions,
@@ -48,6 +41,13 @@ from docling.datamodel.pipeline_options import (
     TableFormerMode,
     VlmPipelineOptions,
 )
+from docling.datamodel.pipeline_vlm_model_spec import (
+    GRANITE_VISION_OLLAMA,
+    GRANITE_VISION_TRANSFORMERS,
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+    VlmModelType,
+)
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
@@ -549,20 +549,16 @@ def convert(  # noqa: C901
             )
 
             if vlm_model == VlmModelType.GRANITE_VISION:
-                pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
             elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-                pipeline_options.vlm_options = (
-                    granite_vision_vlm_ollama_conversion_options
-                )
+                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
             elif vlm_model == VlmModelType.SMOLDOCLING:
-                pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                 if sys.platform == "darwin":
                     try:
                         import mlx_vlm
 
-                        pipeline_options.vlm_options = (
-                            smoldocling_vlm_mlx_conversion_options
-                        )
+                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                     except ImportError:
                         _log.warning(
                             "To run SmolDocling faster, please install mlx-vlm:\n"
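Note for reviewers: the `convert()` branch above is the only CLI logic that changes; each `--vlm-model` value now maps one-to-one onto a renamed constant. A minimal sketch of the same selection done programmatically, using only imports introduced in this diff (the `mlx_vlm` probe mirrors the darwin fallback above):

import sys

from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_vlm_model_spec import (
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
)

pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS

if sys.platform == "darwin":
    try:
        import mlx_vlm  # noqa: F401 -- probe for the MLX backend only

        pipeline_options.vlm_options = SMOLDOCLING_MLX
    except ImportError:
        pass  # stay on the Transformers weights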
Literal["hf_model_options"] = "hf_model_options" + + repo_id: str + load_in_8bit: bool = True + llm_int8_threshold: float = 6.0 + quantized: bool = False + + inference_framework: InferenceFramework + response_format: ResponseFormat + + scale: float = 2.0 + + temperature: float = 0.0 + stop_strings: list[str] = [] + + use_kv_cache: bool = True + max_new_tokens: int = 4096 + + @property + def repo_cache_folder(self) -> str: + return self.repo_id.replace("/", "--") + + +class ApiVlmOptions(BaseVlmOptions): + kind: Literal["api_model_options"] = "api_model_options" + + url: AnyUrl = AnyUrl( + "http://localhost:11434/v1/chat/completions" + ) # Default to ollama + headers: Dict[str, str] = {} + params: Dict[str, Any] = {} + scale: float = 2.0 + timeout: float = 60 + concurrency: int = 1 + response_format: ResponseFormat + + class VlmPipelineOptions(PaginatedPipelineOptions): generate_page_images: bool = True force_backend_text: bool = ( diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_vlm_model_spec.py similarity index 59% rename from docling/datamodel/pipeline_model_specializations.py rename to docling/datamodel/pipeline_vlm_model_spec.py index 33d968ff..6547600d 100644 --- a/docling/datamodel/pipeline_model_specializations.py +++ b/docling/datamodel/pipeline_vlm_model_spec.py @@ -1,83 +1,22 @@ import logging from enum import Enum -from pathlib import Path -from typing import Any, ClassVar, Dict, List, Literal, Optional, Union from pydantic import ( AnyUrl, - BaseModel, +) + +from docling.datamodel.pipeline_options import ( + ApiVlmOptions, + HuggingFaceVlmOptions, + InferenceFramework, + ResponseFormat, ) _log = logging.getLogger(__name__) -class BaseVlmOptions(BaseModel): - kind: str - prompt: str - - -class ResponseFormat(str, Enum): - DOCTAGS = "doctags" - MARKDOWN = "markdown" - HTML = "html" - - -class InferenceFramework(str, Enum): - MLX = "mlx" - TRANSFORMERS = "transformers" - TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq" - TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM" - TRANSFORMERS_LlavaForConditionalGeneration = ( - "transformers-LlavaForConditionalGeneration" - ) - - -class HuggingFaceVlmOptions(BaseVlmOptions): - kind: Literal["hf_model_options"] = "hf_model_options" - - repo_id: str - load_in_8bit: bool = True - llm_int8_threshold: float = 6.0 - quantized: bool = False - - inference_framework: InferenceFramework - response_format: ResponseFormat - - scale: float = 2.0 - - temperature: float = 0.0 - stop_strings: list[str] = [] - - use_kv_cache: bool = True - max_new_tokens: int = 4096 - - @property - def repo_cache_folder(self) -> str: - return self.repo_id.replace("/", "--") - - -class ApiVlmOptions(BaseVlmOptions): - kind: Literal["api_model_options"] = "api_model_options" - - url: AnyUrl = AnyUrl( - "http://localhost:11434/v1/chat/completions" - ) # Default to ollama - headers: Dict[str, str] = {} - params: Dict[str, Any] = {} - scale: float = 2.0 - timeout: float = 60 - concurrency: int = 1 - response_format: ResponseFormat - - -class VlmModelType(str, Enum): - SMOLDOCLING = "smoldocling" - GRANITE_VISION = "granite_vision" - GRANITE_VISION_OLLAMA = "granite_vision_ollama" - - # SmolDocling -smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions( +SMOLDOCLING_MLX = HuggingFaceVlmOptions( repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16", prompt="Convert this page to docling.", response_format=ResponseFormat.DOCTAGS, @@ -86,7 +25,7 @@ 
diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_vlm_model_spec.py
similarity index 59%
rename from docling/datamodel/pipeline_model_specializations.py
rename to docling/datamodel/pipeline_vlm_model_spec.py
index 33d968ff..6547600d 100644
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_vlm_model_spec.py
@@ -1,83 +1,22 @@
 import logging
 from enum import Enum
-from pathlib import Path
-from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
 
 from pydantic import (
     AnyUrl,
-    BaseModel,
+)
+
+from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
+    HuggingFaceVlmOptions,
+    InferenceFramework,
+    ResponseFormat,
 )
 
 _log = logging.getLogger(__name__)
 
 
-class BaseVlmOptions(BaseModel):
-    kind: str
-    prompt: str
-
-
-class ResponseFormat(str, Enum):
-    DOCTAGS = "doctags"
-    MARKDOWN = "markdown"
-    HTML = "html"
-
-
-class InferenceFramework(str, Enum):
-    MLX = "mlx"
-    TRANSFORMERS = "transformers"
-    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
-    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
-    TRANSFORMERS_LlavaForConditionalGeneration = (
-        "transformers-LlavaForConditionalGeneration"
-    )
-
-
-class HuggingFaceVlmOptions(BaseVlmOptions):
-    kind: Literal["hf_model_options"] = "hf_model_options"
-
-    repo_id: str
-    load_in_8bit: bool = True
-    llm_int8_threshold: float = 6.0
-    quantized: bool = False
-
-    inference_framework: InferenceFramework
-    response_format: ResponseFormat
-
-    scale: float = 2.0
-
-    temperature: float = 0.0
-    stop_strings: list[str] = []
-
-    use_kv_cache: bool = True
-    max_new_tokens: int = 4096
-
-    @property
-    def repo_cache_folder(self) -> str:
-        return self.repo_id.replace("/", "--")
-
-
-class ApiVlmOptions(BaseVlmOptions):
-    kind: Literal["api_model_options"] = "api_model_options"
-
-    url: AnyUrl = AnyUrl(
-        "http://localhost:11434/v1/chat/completions"
-    )  # Default to ollama
-    headers: Dict[str, str] = {}
-    params: Dict[str, Any] = {}
-    scale: float = 2.0
-    timeout: float = 60
-    concurrency: int = 1
-    response_format: ResponseFormat
-
-
-class VlmModelType(str, Enum):
-    SMOLDOCLING = "smoldocling"
-    GRANITE_VISION = "granite_vision"
-    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
-
-
 # SmolDocling
-smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+SMOLDOCLING_MLX = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
@@ -86,7 +25,7 @@ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     temperature=0.0,
 )
 
-smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
+SMOLDOCLING_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
@@ -96,7 +35,7 @@ smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
 )
 
 # GraniteVision
-granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
+GRANITE_VISION_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="ibm-granite/granite-vision-3.2-2b",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
     response_format=ResponseFormat.MARKDOWN,
@@ -105,7 +44,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     temperature=0.0,
 )
 
-granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
+GRANITE_VISION_OLLAMA = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
     prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
@@ -116,7 +55,7 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
 )
 
 # Pixtral
-pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
+PIXTRAL_12B_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="mistral-community/pixtral-12b",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
@@ -125,7 +64,7 @@ pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
     temperature=0.0,
 )
 
-pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+PIXTRAL_12B_MLX = HuggingFaceVlmOptions(
     repo_id="mlx-community/pixtral-12b-bf16",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
@@ -135,7 +74,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 )
 
 # Phi4
-phi_vlm_conversion_options = HuggingFaceVlmOptions(
+PHI4_TRANSFORMERS = HuggingFaceVlmOptions(
     repo_id="microsoft/Phi-4-multimodal-instruct",
     prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
     response_format=ResponseFormat.MARKDOWN,
@@ -145,7 +84,7 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 )
 
 # Qwen
-qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+QWEN25_VL_3B_MLX = HuggingFaceVlmOptions(
     repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
@@ -155,7 +94,7 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 )
 
 # Gemma-3
-gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
+GEMMA3_12B_MLX = HuggingFaceVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
@@ -164,7 +103,7 @@ gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
     temperature=0.0,
 )
 
-gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
+GEMMA3_27B_MLX = HuggingFaceVlmOptions(
     repo_id="mlx-community/gemma-3-27b-it-bf16",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
@@ -172,3 +111,9 @@ gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
     scale=2.0,
     temperature=0.0,
 )
+
+
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
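With the presets now plain module-level constants, adding a variant is just another instance of the options classes. A hypothetical Ollama preset in the same shape as `GRANITE_VISION_OLLAMA` above (the model tag and prompt are illustrative, not part of this diff):

from pydantic import AnyUrl

from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat

MY_OLLAMA_VLM = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),  # Ollama default
    params={"model": "llava:7b"},  # illustrative model tag
    prompt="Convert this page to markdown.",
    scale=2.0,
    timeout=90,
    response_format=ResponseFormat.MARKDOWN,
)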
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 8acd471c..0c6237e1 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -27,13 +27,11 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_model_specializations import (
+from docling.datamodel.pipeline_options import (
     ApiVlmOptions,
     HuggingFaceVlmOptions,
     InferenceFramework,
     ResponseFormat,
-)
-from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py
index e2b4b194..585f22a6 100644
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -2,14 +2,14 @@ import logging
 from pathlib import Path
 from typing import Optional
 
-from docling.datamodel.pipeline_model_specializations import (
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
-)
 from docling.datamodel.pipeline_options import (
     granite_picture_description,
     smolvlm_picture_description,
 )
+from docling.datamodel.pipeline_vlm_model_spec import (
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+)
 from docling.datamodel.settings import settings
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
@@ -87,8 +87,8 @@ def download_models(
     if with_smoldocling:
         _log.info("Downloading SmolDocling model...")
         HuggingFaceVlmModel.download_models(
-            repo_id=smoldocling_vlm_conversion_options.repo_id,
-            local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
+            repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
+            local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
             force=force,
             progress=progress,
         )
@@ -96,9 +96,8 @@ def download_models(
     if with_smoldocling_mlx:
         _log.info("Downloading SmolDocling MLX model...")
         HuggingFaceVlmModel.download_models(
-            repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
-            local_dir=output_dir
-            / smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
+            repo_id=SMOLDOCLING_MLX.repo_id,
+            local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
             force=force,
             progress=progress,
        )
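The downloader now reads `repo_id` and `repo_cache_folder` off the renamed constants; callers are unaffected. A sketch of prefetching just the SmolDocling weights through the public helper, assuming the keyword names visible in the hunks above (the cache path is illustrative):

from pathlib import Path

from docling.utils.model_downloader import download_models

# Prefetch the SmolDocling (Transformers) checkpoint into a local cache.
download_models(
    output_dir=Path("./model-cache"),  # illustrative location
    with_smoldocling=True,
    progress=True,
)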
diff --git a/docs/examples/compare_vlm_models.py b/docs/examples/compare_vlm_models.py
index 2d8915de..457c7d58 100644
--- a/docs/examples/compare_vlm_models.py
+++ b/docs/examples/compare_vlm_models.py
@@ -13,20 +13,20 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_model_specializations import (
-    gemma_3_12b_mlx_conversion_options,
-    granite_vision_vlm_conversion_options,
-    granite_vision_vlm_ollama_conversion_options,
-    phi_vlm_conversion_options,
-    pixtral_12b_vlm_conversion_options,
-    pixtral_12b_vlm_mlx_conversion_options,
-    qwen25_vl_3b_vlm_mlx_conversion_options,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
-)
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
+from docling.datamodel.pipeline_vlm_model_spec import (
+    GEMMA3_12B_MLX,
+    GRANITE_VISION_OLLAMA,
+    GRANITE_VISION_TRANSFORMERS,
+    PHI4_TRANSFORMERS,
+    PIXTRAL_12B_MLX,
+    PIXTRAL_12B_TRANSFORMERS,
+    QWEN25_VL_3B_MLX,
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
@@ -120,16 +120,16 @@ if __name__ == "__main__":
 
     rows = []
     for vlm_options in [
         ## DocTags / SmolDocling models
-        smoldocling_vlm_conversion_options,
-        # smoldocling_vlm_mlx_conversion_options,
+        SMOLDOCLING_TRANSFORMERS,
+        SMOLDOCLING_MLX,
         ## Markdown models (using MLX framework)
-        # qwen25_vl_3b_vlm_mlx_conversion_options,
-        # pixtral_12b_vlm_mlx_conversion_options,
-        # gemma_3_12b_mlx_conversion_options,
+        QWEN25_VL_3B_MLX,
+        PIXTRAL_12B_MLX,
+        GEMMA3_12B_MLX,
         ## Markdown models (using Transformers framework)
-        # granite_vision_vlm_conversion_options,
-        phi_vlm_conversion_options,
-        pixtral_12b_vlm_conversion_options,
+        GRANITE_VISION_TRANSFORMERS,
+        PHI4_TRANSFORMERS,
+        PIXTRAL_12B_TRANSFORMERS,
     ]:
         pipeline_options.vlm_options = vlm_options
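For a single preset rather than the full sweep in this example script, the same wiring reduces to a few lines; `DocumentConverter`, `PdfFormatOption`, and `VlmPipeline` come from the imports above (the input path is illustrative):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_vlm_model_spec import SMOLDOCLING_TRANSFORMERS
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("document.pdf")  # illustrative input
print(result.document.export_to_markdown())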