From 15a8f328c2745e6958d42904c4248f7706f096e4 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Thu, 15 May 2025 05:27:16 +0200
Subject: [PATCH] added pipeline_model_specializations file

Signed-off-by: Peter Staar
---
 .../pipeline_model_specializations.py | 142 ++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 docling/datamodel/pipeline_model_specializations.py

diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_model_specializations.py
new file mode 100644
index 00000000..68db935d
--- /dev/null
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -0,0 +1,142 @@
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
+
+from pydantic import (
+    AnyUrl,
+    BaseModel,
+)
+
+_log = logging.getLogger(__name__)
+
+
+class BaseVlmOptions(BaseModel):
+    kind: str
+    prompt: str
+
+
+class ResponseFormat(str, Enum):
+    DOCTAGS = "doctags"
+    MARKDOWN = "markdown"
+    HTML = "html"
+
+
+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+    OPENAI = "openai"
+    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
+    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
+    TRANSFORMERS_LlavaForConditionalGeneration = (
+        "transformers-LlavaForConditionalGeneration"
+    )
+
+
+class HuggingFaceVlmOptions(BaseVlmOptions):
+    kind: Literal["hf_model_options"] = "hf_model_options"
+
+    repo_id: str
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    inference_framework: InferenceFramework
+    response_format: ResponseFormat
+
+    scale: float = 2.0
+
+    use_kv_cache: bool = True
+    max_new_tokens: int = 4096
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
+
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    scale: float = 2.0
+    timeout: float = 60
+    response_format: ResponseFormat
+
+
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+
+
+# SmolDocling
+smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+    scale=2.0,
+)
+
+smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    scale=2.0,
+)
+
+# GraniteVision
+granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.2-2b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
+    scale=2.0,
+)
+
+granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
+    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+    params={"model": "granite3.2-vision:2b"},
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    scale=1.0,
+    timeout=120,
+    response_format=ResponseFormat.MARKDOWN,
+)
+
+# Pixtral
+pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="mistral-community/pixtral-12b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
+    scale=2.0,
+)
+
+pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+    repo_id="mlx-community/pixtral-12b-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    scale=2.0,
+)
+
+# Phi4
+phi_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="microsoft/Phi-4-multimodal-instruct",
+    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare MarkDown",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
+)
+
+# Qwen
+qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+)
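Not part of the patch: a minimal usage sketch, assuming a docling checkout that includes the module added above. The predefined option objects are plain pydantic models, so downstream code can select one and read its fields directly; how a concrete pipeline consumes them is outside the scope of this patch.

# Minimal sketch (assumption: docling with this new module is importable).
from docling.datamodel.pipeline_model_specializations import (
    ResponseFormat,
    smoldocling_vlm_conversion_options,
)

opts = smoldocling_vlm_conversion_options
print(opts.repo_id)            # ds4sd/SmolDocling-256M-preview
print(opts.repo_cache_folder)  # ds4sd--SmolDocling-256M-preview, via repo_id.replace("/", "--")
assert opts.response_format == ResponseFormat.DOCTAGS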