mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
working TwoStageVlmModel
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
b2336830eb
commit
fb74d0c5b3
@@ -39,6 +39,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
|
||||
InferenceFramework,
|
||||
InlineVlmOptions,
|
||||
ResponseFormat,
|
||||
TwoStageVlmOptions,
|
||||
)
|
||||
from docling.datamodel.vlm_model_specs import (
|
||||
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
||||
@@ -276,9 +277,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
||||
False # (To be used with vlms, or other generative models)
|
||||
)
|
||||
# If True, text from backend will be used instead of generated text
|
||||
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
|
||||
vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = (
|
||||
smoldocling_vlm_conversion_options
|
||||
#SMOLDOCLING_TRANSFORMERS
|
||||
# SMOLDOCLING_TRANSFORMERS
|
||||
)
|
||||
|
||||
|
||||
@@ -294,9 +295,6 @@ class AsrPipelineOptions(PipelineOptions):
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||
"""Options for the PDF pipeline."""
|
||||
|
||||
|
@@ -7,8 +7,8 @@ from typing_extensions import deprecated
|
||||
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||
|
||||
# from docling.datamodel.pipeline_options_vlm_model import (
|
||||
# InferenceFramework,
|
||||
# TransformersModelType,
|
||||
# InferenceFramework,
|
||||
# TransformersModelType,
|
||||
# )
|
||||
|
||||
|
||||
|
@@ -6,6 +6,9 @@ from pydantic import AnyUrl, BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||
from docling.datamodel.layout_model_specs import (
|
||||
LayoutModelConfig,
|
||||
)
|
||||
|
||||
|
||||
class BaseVlmOptions(BaseModel):
|
||||
@@ -88,12 +91,11 @@ class ApiVlmOptions(BaseVlmOptions):
|
||||
concurrency: int = 1
|
||||
response_format: ResponseFormat
|
||||
|
||||
from docling.datamodel.layout_model_specs import (
|
||||
LayoutModelConfig,
|
||||
)
|
||||
|
||||
|
||||
class TwoStageVlmOptions(BaseModel):
|
||||
kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"
|
||||
|
||||
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS
|
||||
layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2
|
||||
response_format: ResponseFormat # final response of the VLM
|
||||
|
||||
layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2
|
||||
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS
|
||||
|
@@ -143,7 +143,9 @@ GEMMA3_27B_MLX = InlineVlmOptions(
|
||||
)
|
||||
|
||||
VLM2STAGE = TwoStageVlmOptions(
|
||||
vlm_options=SMOLDOCLING_MLX, layout_options=DOCLING_LAYOUT_HERON
|
||||
vlm_options=SMOLDOCLING_MLX,
|
||||
layout_options=DOCLING_LAYOUT_HERON,
|
||||
response_format=SMOLDOCLING_MLX.response_format,
|
||||
)
|
||||
|
||||
|
||||
|
@@ -26,7 +26,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import VlmPipelineOptions
|
||||
from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions
|
||||
from docling.datamodel.pipeline_options_vlm_model import (
|
||||
ApiVlmOptions,
|
||||
InferenceFramework,
|
||||
@@ -115,38 +115,47 @@ class VlmPipeline(PaginatedPipeline):
|
||||
TwoStageVlmOptions, self.pipeline_options.vlm_options
|
||||
)
|
||||
|
||||
layout_options = twostagevlm_options.layout_options
|
||||
vlm_options = twostagevlm_options.vlm_options
|
||||
stage_1_options = twostagevlm_options.layout_options
|
||||
stage_2_options = twostagevlm_options.vlm_options
|
||||
|
||||
layout_model = LayoutModel(
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
options=layout_options,
|
||||
options=LayoutOptions(
|
||||
create_orphan_clusters=False, model_spec=stage_1_options
|
||||
),
|
||||
)
|
||||
|
||||
if vlm_options.inference_framework == InferenceFramework.MLX:
|
||||
if (
|
||||
isinstance(stage_2_options, InlineVlmOptions)
|
||||
and stage_2_options.inference_framework == InferenceFramework.MLX
|
||||
):
|
||||
vlm_model_mlx = HuggingFaceMlxModel(
|
||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
vlm_options=vlm_options,
|
||||
vlm_options=stage_2_options,
|
||||
)
|
||||
self.build_pipe = [
|
||||
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx)
|
||||
]
|
||||
elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
|
||||
elif (
|
||||
isinstance(stage_2_options, InlineVlmOptions)
|
||||
and stage_2_options.inference_framework
|
||||
== InferenceFramework.TRANSFORMERS
|
||||
):
|
||||
vlm_model_hf = HuggingFaceTransformersVlmModel(
|
||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
vlm_options=vlm_options,
|
||||
vlm_options=stage_2_options,
|
||||
)
|
||||
self.build_pipe = [
|
||||
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf)
|
||||
]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
|
||||
f"Could not instantiate the right type of VLM pipeline: {stage_2_options}"
|
||||
)
|
||||
|
||||
self.enrichment_pipe = [
|
||||
|
Loading…
Reference in New Issue
Block a user