working TwoStageVlmModel

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2025-07-10 15:11:53 +02:00
parent b2336830eb
commit fb74d0c5b3
5 changed files with 34 additions and 23 deletions

View File

@@ -39,6 +39,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     InferenceFramework,
     InlineVlmOptions,
     ResponseFormat,
+    TwoStageVlmOptions,
 )
 from docling.datamodel.vlm_model_specs import (
     GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
@@ -276,7 +277,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = (
         smoldocling_vlm_conversion_options
         # SMOLDOCLING_TRANSFORMERS
     )
@@ -294,9 +295,6 @@ class AsrPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None

 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

View File

@@ -6,6 +6,9 @@ from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated

 from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.layout_model_specs import (
+    LayoutModelConfig,
+)

 class BaseVlmOptions(BaseModel):
@@ -88,12 +91,11 @@ class ApiVlmOptions(BaseVlmOptions):
     concurrency: int = 1
     response_format: ResponseFormat

-from docling.datamodel.layout_model_specs import (
-    LayoutModelConfig,
-)

 class TwoStageVlmOptions(BaseModel):
     kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"

-    vlm_options: Union[InlineVlmOptions, ApiVlmOptions]  # = SMOLDOCLING_TRANSFORMERS
+    response_format: ResponseFormat  # final response of the VLM
     layout_options: LayoutModelConfig  # = DOCLING_LAYOUT_V2
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions]  # = SMOLDOCLING_TRANSFORMERS

View File

@@ -143,7 +143,9 @@ GEMMA3_27B_MLX = InlineVlmOptions(
 )

 VLM2STAGE = TwoStageVlmOptions(
-    vlm_options=SMOLDOCLING_MLX, layout_options=DOCLING_LAYOUT_HERON
+    vlm_options=SMOLDOCLING_MLX,
+    layout_options=DOCLING_LAYOUT_HERON,
+    response_format=SMOLDOCLING_MLX.response_format,
 )

View File

@@ -26,7 +26,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import VlmPipelineOptions
+from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -115,38 +115,47 @@ class VlmPipeline(PaginatedPipeline):
                 TwoStageVlmOptions, self.pipeline_options.vlm_options
             )
-            layout_options = twostagevlm_options.layout_options
-            vlm_options = twostagevlm_options.vlm_options
+            stage_1_options = twostagevlm_options.layout_options
+            stage_2_options = twostagevlm_options.vlm_options

             layout_model = LayoutModel(
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
-                options=layout_options,
-                create_orphan_clusters=False,
+                options=LayoutOptions(
+                    create_orphan_clusters=False, model_spec=stage_1_options
+                ),
             )

-            if vlm_options.inference_framework == InferenceFramework.MLX:
+            if (
+                isinstance(stage_2_options, InlineVlmOptions)
+                and stage_2_options.inference_framework == InferenceFramework.MLX
+            ):
                 vlm_model_mlx = HuggingFaceMlxModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
                     artifacts_path=artifacts_path,
                     accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=vlm_options,
+                    vlm_options=stage_2_options,
                 )
                 self.build_pipe = [
                     TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx)
                 ]
-            elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
+            elif (
+                isinstance(stage_2_options, InlineVlmOptions)
+                and stage_2_options.inference_framework
+                == InferenceFramework.TRANSFORMERS
+            ):
                 vlm_model_hf = HuggingFaceTransformersVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
                     artifacts_path=artifacts_path,
                     accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=vlm_options,
+                    vlm_options=stage_2_options,
                 )
                 self.build_pipe = [
                     TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf)
                 ]
             else:
                 raise ValueError(
-                    f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
+                    f"Could not instantiate the right type of VLM pipeline: {stage_2_options}"
                 )

             self.enrichment_pipe = [