mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner import
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
f63312add6
commit
2bd15cc809
@ -17,6 +17,12 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
|
|||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
# Import the following for backwards compatibility
|
# Import the following for backwards compatibility
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
|
ApiVlmOptions,
|
||||||
|
HuggingFaceVlmOptions,
|
||||||
|
InferenceFramework,
|
||||||
|
ResponseFormat,
|
||||||
|
)
|
||||||
from docling.datamodel.pipeline_vlm_model_spec import (
|
from docling.datamodel.pipeline_vlm_model_spec import (
|
||||||
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
||||||
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
||||||
@ -305,62 +311,6 @@ class PaginatedPipelineOptions(PipelineOptions):
|
|||||||
generate_picture_images: bool = False
|
generate_picture_images: bool = False
|
||||||
|
|
||||||
|
|
||||||
class BaseVlmOptions(BaseModel):
|
|
||||||
kind: str
|
|
||||||
prompt: str
|
|
||||||
|
|
||||||
|
|
||||||
class ResponseFormat(str, Enum):
|
|
||||||
DOCTAGS = "doctags"
|
|
||||||
MARKDOWN = "markdown"
|
|
||||||
HTML = "html"
|
|
||||||
|
|
||||||
|
|
||||||
class InferenceFramework(str, Enum):
|
|
||||||
MLX = "mlx"
|
|
||||||
TRANSFORMERS = "transformers"
|
|
||||||
TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
|
|
||||||
TRANSFORMERS_CAUSALLM = "transformers-causallm"
|
|
||||||
|
|
||||||
|
|
||||||
class HuggingFaceVlmOptions(BaseVlmOptions):
|
|
||||||
kind: Literal["hf_model_options"] = "hf_model_options"
|
|
||||||
|
|
||||||
repo_id: str
|
|
||||||
load_in_8bit: bool = True
|
|
||||||
llm_int8_threshold: float = 6.0
|
|
||||||
quantized: bool = False
|
|
||||||
|
|
||||||
inference_framework: InferenceFramework
|
|
||||||
response_format: ResponseFormat
|
|
||||||
|
|
||||||
scale: float = 2.0
|
|
||||||
|
|
||||||
temperature: float = 0.0
|
|
||||||
stop_strings: list[str] = []
|
|
||||||
|
|
||||||
use_kv_cache: bool = True
|
|
||||||
max_new_tokens: int = 4096
|
|
||||||
|
|
||||||
@property
|
|
||||||
def repo_cache_folder(self) -> str:
|
|
||||||
return self.repo_id.replace("/", "--")
|
|
||||||
|
|
||||||
|
|
||||||
class ApiVlmOptions(BaseVlmOptions):
|
|
||||||
kind: Literal["api_model_options"] = "api_model_options"
|
|
||||||
|
|
||||||
url: AnyUrl = AnyUrl(
|
|
||||||
"http://localhost:11434/v1/chat/completions"
|
|
||||||
) # Default to ollama
|
|
||||||
headers: Dict[str, str] = {}
|
|
||||||
params: Dict[str, Any] = {}
|
|
||||||
scale: float = 2.0
|
|
||||||
timeout: float = 60
|
|
||||||
concurrency: int = 1
|
|
||||||
response_format: ResponseFormat
|
|
||||||
|
|
||||||
|
|
||||||
class VlmPipelineOptions(PaginatedPipelineOptions):
|
class VlmPipelineOptions(PaginatedPipelineOptions):
|
||||||
generate_page_images: bool = True
|
generate_page_images: bool = True
|
||||||
force_backend_text: bool = (
|
force_backend_text: bool = (
|
||||||
|
60
docling/datamodel/pipeline_options_vlm_model.py
Normal file
60
docling/datamodel/pipeline_options_vlm_model.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Dict, Literal
|
||||||
|
|
||||||
|
from pydantic import AnyUrl, BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class BaseVlmOptions(BaseModel):
    """Common base for the VLM option models declared in this module.

    Both fields are required: neither declares a default here.
    """

    # Plain string on the base; HuggingFaceVlmOptions and ApiVlmOptions each
    # narrow it to a single Literal value with a matching default.
    kind: str
    # NOTE(review): presumably the instruction text sent to the model along
    # with each page image -- confirm against the model implementations.
    prompt: str
|
||||||
|
|
||||||
|
|
||||||
|
class ResponseFormat(str, Enum):
    """Closed set of response format identifiers.

    Mixing in ``str`` makes every member compare equal to its plain string
    value, so ``ResponseFormat.HTML == "html"`` holds.
    """

    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
    HTML = "html"
|
||||||
|
|
||||||
|
|
||||||
|
class InferenceFramework(str, Enum):
    """Closed set of inference framework identifiers.

    Mixing in ``str`` makes every member compare equal to its plain string
    value, so ``InferenceFramework.MLX == "mlx"`` holds.
    """

    MLX = "mlx"
    TRANSFORMERS = "transformers"
    TRANSFORMERS_VISION2SEQ = "transformers-vision2seq"
    TRANSFORMERS_CAUSALLM = "transformers-causallm"
|
||||||
|
|
||||||
|
|
||||||
|
class HuggingFaceVlmOptions(BaseVlmOptions):
    """Option model tagged with ``kind == "hf_model_options"``.

    ``repo_id``, ``inference_framework`` and ``response_format`` have no
    default and must be supplied, together with the ``prompt`` inherited
    from ``BaseVlmOptions``. Every other field has a default.
    """

    # Discriminator value; fixed to a single literal for this model.
    kind: Literal["hf_model_options"] = "hf_model_options"

    # Repository identifier; also the input to ``repo_cache_folder`` below.
    repo_id: str
    # NOTE(review): the next three look like quantization settings consumed
    # by the model loaders -- their exact effect is not visible in this file.
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    inference_framework: InferenceFramework
    response_format: ResponseFormat

    # NOTE(review): presumably the page-image scale factor -- TODO confirm.
    scale: float = 2.0

    temperature: float = 0.0
    # A literal ``[]`` default is fine on a pydantic model: pydantic copies
    # mutable defaults per instance rather than sharing one list.
    stop_strings: list[str] = []

    use_kv_cache: bool = True
    max_new_tokens: int = 4096

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``/`` replaced by ``--``."""
        # e.g. "org/name" -> "org--name"
        return self.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
|
class ApiVlmOptions(BaseVlmOptions):
    """Option model tagged with ``kind == "api_model_options"``.

    Only ``response_format`` (plus the ``prompt`` inherited from
    ``BaseVlmOptions``) must be supplied; every other field has a default.
    """

    # Discriminator value; fixed to a single literal for this model.
    kind: Literal["api_model_options"] = "api_model_options"

    # Default to ollama
    url: AnyUrl = AnyUrl("http://localhost:11434/v1/chat/completions")
    # Literal ``{}`` defaults are fine on a pydantic model: pydantic copies
    # mutable defaults per instance rather than sharing one dict.
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    # NOTE(review): presumably the page-image scale factor -- TODO confirm.
    scale: float = 2.0
    # NOTE(review): unit is not stated here; presumably seconds -- confirm
    # against the request helper that consumes it.
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
|
@ -5,7 +5,7 @@ from pydantic import (
|
|||||||
AnyUrl,
|
AnyUrl,
|
||||||
)
|
)
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
ApiVlmOptions,
|
ApiVlmOptions,
|
||||||
HuggingFaceVlmOptions,
|
HuggingFaceVlmOptions,
|
||||||
InferenceFramework,
|
InferenceFramework,
|
||||||
|
@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor
|
|||||||
|
|
||||||
from docling.datamodel.base_models import Page, VlmPrediction
|
from docling.datamodel.base_models import Page, VlmPrediction
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import ApiVlmOptions
|
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
|
||||||
from docling.exceptions import OperationNotAllowed
|
from docling.exceptions import OperationNotAllowed
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.utils.api_image_request import api_image_request
|
from docling.utils.api_image_request import api_image_request
|
||||||
|
@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction
|
|||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
HuggingFaceVlmOptions,
|
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
|
@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction
|
|||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
HuggingFaceVlmOptions,
|
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
|
@ -8,8 +8,8 @@ from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToke
|
|||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
HuggingFaceVlmOptions,
|
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import HuggingFaceVlmOptions
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
@ -4,7 +4,6 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Union, cast
|
from typing import List, Optional, Union, cast
|
||||||
|
|
||||||
# from docling_core.types import DoclingDocument
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
BoundingBox,
|
BoundingBox,
|
||||||
DocItem,
|
DocItem,
|
||||||
@ -28,11 +27,13 @@ from docling.backend.pdf_backend import PdfDocumentBackend
|
|||||||
from docling.datamodel.base_models import InputFormat, Page
|
from docling.datamodel.base_models import InputFormat, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
|
VlmPipelineOptions,
|
||||||
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
ApiVlmOptions,
|
ApiVlmOptions,
|
||||||
HuggingFaceVlmOptions,
|
HuggingFaceVlmOptions,
|
||||||
InferenceFramework,
|
InferenceFramework,
|
||||||
ResponseFormat,
|
ResponseFormat,
|
||||||
VlmPipelineOptions,
|
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.api_vlm_model import ApiVlmModel
|
from docling.models.api_vlm_model import ApiVlmModel
|
||||||
@ -42,8 +43,6 @@ from docling.models.vlm_models_inline.hf_transformers_causallm_model import (
|
|||||||
from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
|
from docling.models.vlm_models_inline.hf_transformers_vision2seq_model import (
|
||||||
HuggingFaceVlmModel_AutoModelForVision2Seq,
|
HuggingFaceVlmModel_AutoModelForVision2Seq,
|
||||||
)
|
)
|
||||||
|
|
||||||
# from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
|
||||||
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
|
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
45
docs/examples/minimal_vlm_pipeline.py
Normal file
45
docs/examples/minimal_vlm_pipeline.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.pipeline_options import (
|
||||||
|
VlmPipelineOptions,
|
||||||
|
)
|
||||||
|
from docling.datamodel.pipeline_vlm_model_spec import SMOLDOCLING_MLX
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
|
# Document converted by both runs below.
source = "https://arxiv.org/pdf/2501.17887"

###### USING SIMPLE DEFAULT VALUES
# - SmolDocling model
# - Using the transformers framework

# No pipeline_options are passed, so VlmPipeline runs with its defaults.
default_pdf_option = PdfFormatOption(pipeline_cls=VlmPipeline)
converter = DocumentConverter(
    format_options={InputFormat.PDF: default_pdf_option},
)

conversion = converter.convert(source=source)
doc = conversion.document

print(doc.export_to_markdown())


###### USING MACOS MPS ACCELERATOR

# Same pipeline class, but with the SMOLDOCLING_MLX model spec selected.
pipeline_options = VlmPipelineOptions(vlm_options=SMOLDOCLING_MLX)

mlx_pdf_option = PdfFormatOption(
    pipeline_cls=VlmPipeline,
    pipeline_options=pipeline_options,
)
converter = DocumentConverter(
    format_options={InputFormat.PDF: mlx_pdf_option},
)

conversion = converter.convert(source=source)
doc = conversion.document

print(doc.export_to_markdown())
|
@ -7,10 +7,9 @@ from dotenv import load_dotenv
|
|||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
ApiVlmOptions,
|
|
||||||
ResponseFormat,
|
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
|
@ -78,6 +78,7 @@ nav:
|
|||||||
- "Multi-format conversion": examples/run_with_formats.py
|
- "Multi-format conversion": examples/run_with_formats.py
|
||||||
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
|
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
|
||||||
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
|
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
|
||||||
|
- "VLM comparison": examples/compare_vlm_models.py
|
||||||
- "Figure export": examples/export_figures.py
|
- "Figure export": examples/export_figures.py
|
||||||
- "Table export": examples/export_tables.py
|
- "Table export": examples/export_tables.py
|
||||||
- "Multimodal export": examples/export_multimodal.py
|
- "Multimodal export": examples/export_multimodal.py
|
||||||
|
Loading…
Reference in New Issue
Block a user