diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 40947fd9..4fb2885e 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -39,6 +39,7 @@ from docling.datamodel.pipeline_options_vlm_model import ( InferenceFramework, InlineVlmOptions, ResponseFormat, + TwoStageVlmOptions, ) from docling.datamodel.vlm_model_specs import ( GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options, @@ -276,9 +277,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions): False # (To be used with vlms, or other generative models) ) # If True, text from backend will be used instead of generated text - vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = ( + vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = ( smoldocling_vlm_conversion_options - #SMOLDOCLING_TRANSFORMERS + # SMOLDOCLING_TRANSFORMERS ) @@ -294,9 +295,6 @@ class AsrPipelineOptions(PipelineOptions): artifacts_path: Optional[Union[Path, str]] = None - - - class PdfPipelineOptions(PaginatedPipelineOptions): """Options for the PDF pipeline.""" diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py index 12109ad1..f26aad76 100644 --- a/docling/datamodel/pipeline_options_asr_model.py +++ b/docling/datamodel/pipeline_options_asr_model.py @@ -7,8 +7,8 @@ from typing_extensions import deprecated from docling.datamodel.accelerator_options import AcceleratorDevice # from docling.datamodel.pipeline_options_vlm_model import ( - # InferenceFramework, - # TransformersModelType, +# InferenceFramework, +# TransformersModelType, # ) diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py index 3cf2efb0..66c97ca4 100644 --- a/docling/datamodel/pipeline_options_vlm_model.py +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -6,6 +6,9 @@ from pydantic import AnyUrl, BaseModel from typing_extensions import deprecated from docling.datamodel.accelerator_options import AcceleratorDevice +from docling.datamodel.layout_model_specs import ( + LayoutModelConfig, +) class BaseVlmOptions(BaseModel): @@ -88,12 +91,11 @@ class ApiVlmOptions(BaseVlmOptions): concurrency: int = 1 response_format: ResponseFormat -from docling.datamodel.layout_model_specs import ( - LayoutModelConfig, -) - + class TwoStageVlmOptions(BaseModel): kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options" - vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS - layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2 + response_format: ResponseFormat # final response of the VLM + + layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2 + vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py index 25815a92..8025d02f 100644 --- a/docling/datamodel/vlm_model_specs.py +++ b/docling/datamodel/vlm_model_specs.py @@ -143,7 +143,9 @@ GEMMA3_27B_MLX = InlineVlmOptions( ) VLM2STAGE = TwoStageVlmOptions( - vlm_options=SMOLDOCLING_MLX, layout_options=DOCLING_LAYOUT_HERON + vlm_options=SMOLDOCLING_MLX, + layout_options=DOCLING_LAYOUT_HERON, + response_format=SMOLDOCLING_MLX.response_format, ) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 0ee06efb..1c94d977 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -26,7 +26,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import VlmPipelineOptions +from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions from docling.datamodel.pipeline_options_vlm_model import ( ApiVlmOptions, InferenceFramework, @@ -115,38 +115,47 @@ class VlmPipeline(PaginatedPipeline): TwoStageVlmOptions, self.pipeline_options.vlm_options ) - layout_options = twostagevlm_options.layout_options - vlm_options = twostagevlm_options.vlm_options + stage_1_options = twostagevlm_options.layout_options + stage_2_options = twostagevlm_options.vlm_options layout_model = LayoutModel( artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, - options=layout_options, + options=LayoutOptions( + create_orphan_clusters=False, model_spec=stage_1_options + ), ) - if vlm_options.inference_framework == InferenceFramework.MLX: + if ( + isinstance(stage_2_options, InlineVlmOptions) + and stage_2_options.inference_framework == InferenceFramework.MLX + ): vlm_model_mlx = HuggingFaceMlxModel( enabled=True, # must be always enabled for this pipeline to make sense. artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, - vlm_options=vlm_options, + vlm_options=stage_2_options, ) self.build_pipe = [ TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx) ] - elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS: + elif ( + isinstance(stage_2_options, InlineVlmOptions) + and stage_2_options.inference_framework + == InferenceFramework.TRANSFORMERS + ): vlm_model_hf = HuggingFaceTransformersVlmModel( enabled=True, # must be always enabled for this pipeline to make sense. artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, - vlm_options=vlm_options, + vlm_options=stage_2_options, ) self.build_pipe = [ TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf) ] else: raise ValueError( - f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}" + f"Could not instantiate the right type of VLM pipeline: {stage_2_options}" ) self.enrichment_pipe = [