add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner import

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-06-01 18:24:04 +02:00
parent f63312add6
commit 2bd15cc809
11 changed files with 121 additions and 67 deletions

View File

@ -17,6 +17,12 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
from typing_extensions import deprecated
# Import the following for backwards compatibility
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
HuggingFaceVlmOptions,
InferenceFramework,
ResponseFormat,
)
from docling.datamodel.pipeline_vlm_model_spec import (
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
@ -305,62 +311,6 @@ class PaginatedPipelineOptions(PipelineOptions):
generate_picture_images: bool = False
class BaseVlmOptions(BaseModel):
kind: str
prompt: str
class ResponseFormat(str, Enum):
DOCTAGS = "doctags"
MARKDOWN = "markdown"
HTML = "html"
class InferenceFramework(str, Enum):
MLX = "mlx"
TRANSFORMERS = "transformers"
TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
TRANSFORMERS_CAUSALLM = "transformers-causallm"
class HuggingFaceVlmOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options"
repo_id: str
load_in_8bit: bool = True
llm_int8_threshold: float = 6.0
quantized: bool = False
inference_framework: InferenceFramework
response_format: ResponseFormat
scale: float = 2.0
temperature: float = 0.0
stop_strings: list[str] = []
use_kv_cache: bool = True
max_new_tokens: int = 4096
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
class ApiVlmOptions(BaseVlmOptions):
kind: Literal["api_model_options"] = "api_model_options"
url: AnyUrl = AnyUrl(
"http://localhost:11434/v1/chat/completions"
) # Default to ollama
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
scale: float = 2.0
timeout: float = 60
concurrency: int = 1
response_format: ResponseFormat
class VlmPipelineOptions(PaginatedPipelineOptions):
generate_page_images: bool = True
force_backend_text: bool = (

View File

@ -0,0 +1,60 @@
from enum import Enum
from typing import Any, Dict, Literal
from pydantic import AnyUrl, BaseModel
class BaseVlmOptions(BaseModel):
kind: str
prompt: str
class ResponseFormat(str, Enum):
DOCTAGS = "doctags"
MARKDOWN = "markdown"
HTML = "html"
class InferenceFramework(str, Enum):
MLX = "mlx"
TRANSFORMERS = "transformers"
TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
TRANSFORMERS_CAUSALLM = "transformers-causallm"
class HuggingFaceVlmOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options"
repo_id: str
load_in_8bit: bool = True
llm_int8_threshold: float = 6.0
quantized: bool = False
inference_framework: InferenceFramework
response_format: ResponseFormat
scale: float = 2.0
temperature: float = 0.0
stop_strings: list[str] = []
use_kv_cache: bool = True
max_new_tokens: int = 4096
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
class ApiVlmOptions(BaseVlmOptions):
kind: Literal["api_model_options"] = "api_model_options"
url: AnyUrl = AnyUrl(
"http://localhost:11434/v1/chat/completions"
) # Default to ollama
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
scale: float = 2.0
timeout: float = 60
concurrency: int = 1
response_format: ResponseFormat

View File

@ -5,7 +5,7 @@ from pydantic import (
AnyUrl,
)
from docling.datamodel.pipeline_options import (
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
HuggingFaceVlmOptions,
InferenceFramework,

View File

@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ApiVlmOptions
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
from docling.exceptions import OperationNotAllowed
from docling.models.base_model import BasePageModel
from docling.utils.api_image_request import api_image_request

View File

@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.utils.accelerator_utils import decide_device

View File

@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.utils.accelerator_utils import decide_device

View File

@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToke
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.utils.profiling import TimeRecorder

View File

@ -4,7 +4,6 @@ from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast
# from docling_core.types import DoclingDocument
from docling_core.types.doc import (
BoundingBox,
DocItem,
@ -28,11 +27,13 @@ from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
HuggingFaceVlmOptions,
InferenceFramework,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.models.api_vlm_model import ApiVlmModel
@ -42,8 +43,6 @@ from docling.models.vlm_models_inline.hf_transformers_causallm_model import (
from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
HuggingFaceVlmModel_AutoModelForVision2Seq,
)
# from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder

View File

@ -0,0 +1,45 @@
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
from docling.datamodel.pipeline_vlm_model_spec import SMOLDOCLING_MLX
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
source = "https://arxiv.org/pdf/2501.17887"
###### USING SIMPLE DEFAULT VALUES
# - SmolDocling model
# - Using the transformers framework
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
),
}
)
doc = converter.convert(source=source).document
print(doc.export_to_markdown())
###### USING MACOS MPS ACCELERATOR
pipeline_options = VlmPipelineOptions(
vlm_options=SMOLDOCLING_MLX,
)
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
}
)
doc = converter.convert(source=source).document
print(doc.export_to_markdown())

View File

@ -7,10 +7,9 @@ from dotenv import load_dotenv
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
ApiVlmOptions,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

View File

@ -78,6 +78,7 @@ nav:
- "Multi-format conversion": examples/run_with_formats.py
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
- "VLM comparison": examples/compare_vlm_models.py
- "Figure export": examples/export_figures.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py