mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner imports
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
f63312add6
commit
2bd15cc809
@ -17,6 +17,12 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
from typing_extensions import deprecated
|
||||
|
||||
# Import the following for backwards compatibility
|
||||
from docling.datamodel.pipeline_options_vlm_model import (
|
||||
ApiVlmOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
InferenceFramework,
|
||||
ResponseFormat,
|
||||
)
|
||||
from docling.datamodel.pipeline_vlm_model_spec import (
|
||||
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
||||
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
||||
@ -305,62 +311,6 @@ class PaginatedPipelineOptions(PipelineOptions):
|
||||
generate_picture_images: bool = False
|
||||
|
||||
|
||||
class BaseVlmOptions(BaseModel):
    """Base model holding the fields common to the VLM option classes.

    Both fields are required (neither has a default). The subclasses in this
    file narrow ``kind`` to a fixed ``Literal`` value.
    """

    kind: str  # identifier of the concrete options type
    prompt: str  # prompt string; presumably passed to the model -- confirm against consumers
|
||||
|
||||
|
||||
class ResponseFormat(str, Enum):
    """Response formats that a VLM options object can declare.

    Members are also ``str`` instances, so each one compares equal to its
    plain-string value.
    """

    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
    HTML = "html"
|
||||
|
||||
|
||||
class InferenceFramework(str, Enum):
    """Identifiers of the inference frameworks selectable in VLM options.

    Members are also ``str`` instances, so each one compares equal to its
    plain-string value.
    """

    MLX = "mlx"
    TRANSFORMERS = "transformers"
    TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
    TRANSFORMERS_CAUSALLM = "transformers-causallm"
|
||||
|
||||
|
||||
class HuggingFaceVlmOptions(BaseVlmOptions):
    """Options for a VLM named by a Hugging Face ``repo_id``.

    ``repo_id``, ``inference_framework`` and ``response_format`` have no
    default and must be supplied, in addition to the required fields of
    ``BaseVlmOptions``.
    """

    kind: Literal["hf_model_options"] = "hf_model_options"

    repo_id: str
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    inference_framework: InferenceFramework
    response_format: ResponseFormat

    scale: float = 2.0

    temperature: float = 0.0
    # A mutable default is safe here: pydantic copies field defaults per instance.
    stop_strings: list[str] = []

    use_kv_cache: bool = True
    max_new_tokens: int = 4096

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``/`` replaced by ``--``."""
        return self.repo_id.replace("/", "--")
|
||||
|
||||
|
||||
class ApiVlmOptions(BaseVlmOptions):
    """Options for a VLM addressed through a URL endpoint.

    Only ``response_format`` (plus the required fields of ``BaseVlmOptions``)
    must be supplied; every other field has a default.
    """

    kind: Literal["api_model_options"] = "api_model_options"

    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    # Mutable defaults are safe here: pydantic copies field defaults per instance.
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    scale: float = 2.0
    timeout: float = 60  # NOTE(review): unit is not stated here; presumably seconds -- confirm
    concurrency: int = 1
    response_format: ResponseFormat
|
||||
|
||||
|
||||
class VlmPipelineOptions(PaginatedPipelineOptions):
|
||||
generate_page_images: bool = True
|
||||
force_backend_text: bool = (
|
||||
|
60
docling/datamodel/pipeline_options_vlm_model.py
Normal file
60
docling/datamodel/pipeline_options_vlm_model.py
Normal file
@ -0,0 +1,60 @@
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Literal
|
||||
|
||||
from pydantic import AnyUrl, BaseModel
|
||||
|
||||
|
||||
class BaseVlmOptions(BaseModel):
    """Base model holding the fields common to the VLM option classes.

    Both fields are required (neither has a default). The subclasses in this
    module narrow ``kind`` to a fixed ``Literal`` value.
    """

    kind: str  # identifier of the concrete options type
    prompt: str  # prompt string; presumably passed to the model -- confirm against consumers
|
||||
|
||||
|
||||
class ResponseFormat(str, Enum):
    """Response formats that a VLM options object can declare.

    Members are also ``str`` instances, so each one compares equal to its
    plain-string value.
    """

    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
    HTML = "html"
|
||||
|
||||
|
||||
class InferenceFramework(str, Enum):
    """Identifiers of the inference frameworks selectable in VLM options.

    Members are also ``str`` instances, so each one compares equal to its
    plain-string value.
    """

    MLX = "mlx"
    TRANSFORMERS = "transformers"
    TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
    TRANSFORMERS_CAUSALLM = "transformers-causallm"
|
||||
|
||||
|
||||
class HuggingFaceVlmOptions(BaseVlmOptions):
    """Options for a VLM named by a Hugging Face ``repo_id``.

    ``repo_id``, ``inference_framework`` and ``response_format`` have no
    default and must be supplied, in addition to the required fields of
    ``BaseVlmOptions``.
    """

    kind: Literal["hf_model_options"] = "hf_model_options"

    repo_id: str
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    inference_framework: InferenceFramework
    response_format: ResponseFormat

    scale: float = 2.0

    temperature: float = 0.0
    # A mutable default is safe here: pydantic copies field defaults per instance.
    stop_strings: list[str] = []

    use_kv_cache: bool = True
    max_new_tokens: int = 4096

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``/`` replaced by ``--``."""
        return self.repo_id.replace("/", "--")
|
||||
|
||||
|
||||
class ApiVlmOptions(BaseVlmOptions):
    """Options for a VLM addressed through a URL endpoint.

    Only ``response_format`` (plus the required fields of ``BaseVlmOptions``)
    must be supplied; every other field has a default.
    """

    kind: Literal["api_model_options"] = "api_model_options"

    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    # Mutable defaults are safe here: pydantic copies field defaults per instance.
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    scale: float = 2.0
    timeout: float = 60  # NOTE(review): unit is not stated here; presumably seconds -- confirm
    concurrency: int = 1
    response_format: ResponseFormat
|
@ -5,7 +5,7 @@ from pydantic import (
|
||||
AnyUrl,
|
||||
)
|
||||
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docling.datamodel.pipeline_options_vlm_model import (
|
||||
ApiVlmOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
InferenceFramework,
|
||||
|
@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from docling.datamodel.base_models import Page, VlmPrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import ApiVlmOptions
|
||||
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
|
||||
from docling.exceptions import OperationNotAllowed
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.api_image_request import api_image_request
|
||||
|
@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
)
|
||||
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
|
@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
)
|
||||
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
|
@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToke
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
)
|
||||
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
@ -4,7 +4,6 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union, cast
|
||||
|
||||
# from docling_core.types import DoclingDocument
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
DocItem,
|
||||
@ -28,11 +27,13 @@ from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import (
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.pipeline_options_vlm_model import (
|
||||
ApiVlmOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
InferenceFramework,
|
||||
ResponseFormat,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.api_vlm_model import ApiVlmModel
|
||||
@ -42,8 +43,6 @@ from docling.models.vlm_models_inline.hf_transformers_causallm_model import (
|
||||
from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
|
||||
HuggingFaceVlmModel_AutoModelForVision2Seq,
|
||||
)
|
||||
|
||||
# from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
45
docs/examples/minimal_vlm_pipeline.py
Normal file
45
docs/examples/minimal_vlm_pipeline.py
Normal file
@ -0,0 +1,45 @@
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.pipeline_vlm_model_spec import SMOLDOCLING_MLX
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
# Document converted by both examples below.
source = "https://arxiv.org/pdf/2501.17887"

###### USING SIMPLE DEFAULT VALUES
# - SmolDocling model
# - Using the transformers framework

# No pipeline_options are passed, so VlmPipeline runs with its defaults.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
        ),
    }
)

doc = converter.convert(source=source).document

print(doc.export_to_markdown())


###### USING MACOS MPS ACCELERATOR

# Same conversion, but with the SMOLDOCLING_MLX model spec selected explicitly.
pipeline_options = VlmPipelineOptions(
    vlm_options=SMOLDOCLING_MLX,
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
)

doc = converter.convert(source=source).document

print(doc.export_to_markdown())
|
@ -7,10 +7,9 @@ from dotenv import load_dotenv
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
ApiVlmOptions,
|
||||
ResponseFormat,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
|
@ -78,6 +78,7 @@ nav:
|
||||
- "Multi-format conversion": examples/run_with_formats.py
|
||||
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
|
||||
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
|
||||
- "VLM comparison": examples/compare_vlm_models.py
|
||||
- "Figure export": examples/export_figures.py
|
||||
- "Table export": examples/export_tables.py
|
||||
- "Multimodal export": examples/export_multimodal.py
|
||||
|
Loading…
Reference in New Issue
Block a user