From 2bd15cc809d85b8f85fd81afd5b95800fa14aec5 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Sun, 1 Jun 2025 18:24:04 +0200 Subject: [PATCH] add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner import Signed-off-by: Michele Dolfi --- docling/datamodel/pipeline_options.py | 62 ++----------------- .../datamodel/pipeline_options_vlm_model.py | 60 ++++++++++++++++++ docling/datamodel/pipeline_vlm_model_spec.py | 2 +- docling/models/api_vlm_model.py | 2 +- .../hf_transformers_causallm_model.py | 2 +- .../hf_transformers_vision2seq_model.py | 2 +- docling/models/vlm_models_inline/mlx_model.py | 2 +- docling/pipeline/vlm_pipeline.py | 7 +-- docs/examples/minimal_vlm_pipeline.py | 45 ++++++++++++++ docs/examples/vlm_pipeline_api_model.py | 3 +- mkdocs.yml | 1 + 11 files changed, 121 insertions(+), 67 deletions(-) create mode 100644 docling/datamodel/pipeline_options_vlm_model.py create mode 100644 docs/examples/minimal_vlm_pipeline.py diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 46123835..4c73a5b1 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -17,6 +17,12 @@ from pydantic_settings import BaseSettings, SettingsConfigDict from typing_extensions import deprecated # Import the following for backwards compatibility +from docling.datamodel.pipeline_options_vlm_model import ( + ApiVlmOptions, + HuggingFaceVlmOptions, + InferenceFramework, + ResponseFormat, +) from docling.datamodel.pipeline_vlm_model_spec import ( GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options, GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options, @@ -305,62 +311,6 @@ class PaginatedPipelineOptions(PipelineOptions): generate_picture_images: bool = False -class BaseVlmOptions(BaseModel): - kind: str - prompt: str - - -class ResponseFormat(str, Enum): - DOCTAGS = "doctags" - MARKDOWN = "markdown" - HTML = "html" - - -class InferenceFramework(str, Enum): - MLX = "mlx" - TRANSFORMERS = "transformers" - TRANSFORMERS_VISION2SEQ = "transformers-vision2seq" - TRANSFORMERS_CAUSALLM = "transformers-causallm" - - -class HuggingFaceVlmOptions(BaseVlmOptions): - kind: Literal["hf_model_options"] = "hf_model_options" - - repo_id: str - load_in_8bit: bool = True - llm_int8_threshold: float = 6.0 - quantized: bool = False - - inference_framework: InferenceFramework - response_format: ResponseFormat - - scale: float = 2.0 - - temperature: float = 0.0 - stop_strings: list[str] = [] - - use_kv_cache: bool = True - max_new_tokens: int = 4096 - - @property - def repo_cache_folder(self) -> str: - return self.repo_id.replace("/", "--") - - -class ApiVlmOptions(BaseVlmOptions): - kind: Literal["api_model_options"] = "api_model_options" - - url: AnyUrl = AnyUrl( - "http://localhost:11434/v1/chat/completions" - ) # Default to ollama - headers: Dict[str, str] = {} - params: Dict[str, Any] = {} - scale: float = 2.0 - timeout: float = 60 - concurrency: int = 1 - response_format: ResponseFormat - - class VlmPipelineOptions(PaginatedPipelineOptions): generate_page_images: bool = True force_backend_text: bool = ( diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py new file mode 100644 index 00000000..544e6fd7 --- /dev/null +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -0,0 +1,60 @@ +from enum import Enum +from typing import Any, Dict, Literal + +from pydantic import AnyUrl, BaseModel + + +class BaseVlmOptions(BaseModel): + kind: str + prompt: str + + +class ResponseFormat(str, Enum): + DOCTAGS = "doctags" + MARKDOWN = "markdown" + HTML = "html" + + +class InferenceFramework(str, Enum): + MLX = "mlx" + TRANSFORMERS = "transformers" + TRANSFORMERS_VISION2SEQ = "transformers-vision2seq" + TRANSFORMERS_CAUSALLM = "transformers-causallm" + + +class HuggingFaceVlmOptions(BaseVlmOptions): + kind: Literal["hf_model_options"] = "hf_model_options" + + repo_id: str + load_in_8bit: bool = True + llm_int8_threshold: float = 6.0 + quantized: bool = False + + inference_framework: InferenceFramework + response_format: ResponseFormat + + scale: float = 2.0 + + temperature: float = 0.0 + stop_strings: list[str] = [] + + use_kv_cache: bool = True + max_new_tokens: int = 4096 + + @property + def repo_cache_folder(self) -> str: + return self.repo_id.replace("/", "--") + + +class ApiVlmOptions(BaseVlmOptions): + kind: Literal["api_model_options"] = "api_model_options" + + url: AnyUrl = AnyUrl( + "http://localhost:11434/v1/chat/completions" + ) # Default to ollama + headers: Dict[str, str] = {} + params: Dict[str, Any] = {} + scale: float = 2.0 + timeout: float = 60 + concurrency: int = 1 + response_format: ResponseFormat diff --git a/docling/datamodel/pipeline_vlm_model_spec.py b/docling/datamodel/pipeline_vlm_model_spec.py index 71b2ebe5..36cbc2d0 100644 --- a/docling/datamodel/pipeline_vlm_model_spec.py +++ b/docling/datamodel/pipeline_vlm_model_spec.py @@ -5,7 +5,7 @@ from pydantic import ( AnyUrl, ) -from docling.datamodel.pipeline_options import ( +from docling.datamodel.pipeline_options_vlm_model import ( ApiVlmOptions, HuggingFaceVlmOptions, InferenceFramework, diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py index 60bc6fce..30bc43ea 100644 --- a/docling/models/api_vlm_model.py +++ b/docling/models/api_vlm_model.py @@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ApiVlmOptions +from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions from docling.exceptions import OperationNotAllowed from docling.models.base_model import BasePageModel from docling.utils.api_image_request import api_image_request diff --git a/docling/models/vlm_models_inline/hf_transformers_causallm_model.py b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py index 99dc32b1..4f3a3d3f 100644 --- a/docling/models/vlm_models_inline/hf_transformers_causallm_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py @@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( AcceleratorOptions, - HuggingFaceVlmOptions, ) +from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions from docling.models.base_model import BasePageModel from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.utils.accelerator_utils import decide_device diff --git a/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py index 6de02808..91e04087 100644 --- a/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py @@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( AcceleratorOptions, - HuggingFaceVlmOptions, ) +from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions from docling.models.base_model import BasePageModel from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.utils.accelerator_utils import decide_device diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py index 4e724191..099bb42c 100644 --- a/docling/models/vlm_models_inline/mlx_model.py +++ b/docling/models/vlm_models_inline/mlx_model.py @@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToke from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( AcceleratorOptions, - HuggingFaceVlmOptions, ) +from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions from docling.models.base_model import BasePageModel from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.utils.profiling import TimeRecorder diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 67723e2c..fd4a4375 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -4,7 +4,6 @@ from io import BytesIO from pathlib import Path from typing import List, Optional, Union, cast -# from docling_core.types import DoclingDocument from docling_core.types.doc import ( BoundingBox, DocItem, @@ -28,11 +27,13 @@ from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.pipeline_options import ( + VlmPipelineOptions, +) +from docling.datamodel.pipeline_options_vlm_model import ( ApiVlmOptions, HuggingFaceVlmOptions, InferenceFramework, ResponseFormat, - VlmPipelineOptions, ) from docling.datamodel.settings import settings from docling.models.api_vlm_model import ApiVlmModel @@ -42,8 +43,6 @@ from docling.models.vlm_models_inline.hf_transformers_causallm_model import ( from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import ( HuggingFaceVlmModel_AutoModelForVision2Seq, ) - -# from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel from docling.pipeline.base_pipeline import PaginatedPipeline from docling.utils.profiling import ProfilingScope, TimeRecorder diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py new file mode 100644 index 00000000..d99a94f8 --- /dev/null +++ b/docs/examples/minimal_vlm_pipeline.py @@ -0,0 +1,45 @@ +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + VlmPipelineOptions, +) +from docling.datamodel.pipeline_vlm_model_spec import SMOLDOCLING_MLX +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.pipeline.vlm_pipeline import VlmPipeline + +source = "https://arxiv.org/pdf/2501.17887" + +###### USING SIMPLE DEFAULT VALUES +# - SmolDocling model +# - Using the transformers framework + +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + ), + } +) + +doc = converter.convert(source=source).document + +print(doc.export_to_markdown()) + + +###### USING MACOS MPS ACCELERATOR + +pipeline_options = VlmPipelineOptions( + vlm_options=SMOLDOCLING_MLX, +) + +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ), + } +) + +doc = converter.convert(source=source).document + +print(doc.export_to_markdown()) diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py index 504cecc5..ec29e21c 100644 --- a/docs/examples/vlm_pipeline_api_model.py +++ b/docs/examples/vlm_pipeline_api_model.py @@ -7,10 +7,9 @@ from dotenv import load_dotenv from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( - ApiVlmOptions, - ResponseFormat, VlmPipelineOptions, ) +from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline diff --git a/mkdocs.yml b/mkdocs.yml index 2e40158e..1f42be9d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -78,6 +78,7 @@ nav: - "Multi-format conversion": examples/run_with_formats.py - "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py - "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py + - "VLM comparison": examples/compare_vlm_models.py - "Figure export": examples/export_figures.py - "Table export": examples/export_tables.py - "Multimodal export": examples/export_multimodal.py