mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
working TwoStageVlmModel
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
b2336830eb
commit
fb74d0c5b3
@ -39,6 +39,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
|
|||||||
InferenceFramework,
|
InferenceFramework,
|
||||||
InlineVlmOptions,
|
InlineVlmOptions,
|
||||||
ResponseFormat,
|
ResponseFormat,
|
||||||
|
TwoStageVlmOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.vlm_model_specs import (
|
from docling.datamodel.vlm_model_specs import (
|
||||||
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
||||||
@ -276,7 +277,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|||||||
False # (To be used with vlms, or other generative models)
|
False # (To be used with vlms, or other generative models)
|
||||||
)
|
)
|
||||||
# If True, text from backend will be used instead of generated text
|
# If True, text from backend will be used instead of generated text
|
||||||
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
|
vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = (
|
||||||
smoldocling_vlm_conversion_options
|
smoldocling_vlm_conversion_options
|
||||||
# SMOLDOCLING_TRANSFORMERS
|
# SMOLDOCLING_TRANSFORMERS
|
||||||
)
|
)
|
||||||
@ -294,9 +295,6 @@ class AsrPipelineOptions(PipelineOptions):
|
|||||||
artifacts_path: Optional[Union[Path, str]] = None
|
artifacts_path: Optional[Union[Path, str]] = None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PdfPipelineOptions(PaginatedPipelineOptions):
|
class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||||
"""Options for the PDF pipeline."""
|
"""Options for the PDF pipeline."""
|
||||||
|
|
||||||
|
@ -6,6 +6,9 @@ from pydantic import AnyUrl, BaseModel
|
|||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from docling.datamodel.accelerator_options import AcceleratorDevice
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
|
from docling.datamodel.layout_model_specs import (
|
||||||
|
LayoutModelConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class BaseVlmOptions(BaseModel):
|
class BaseVlmOptions(BaseModel):
|
||||||
@ -88,12 +91,11 @@ class ApiVlmOptions(BaseVlmOptions):
|
|||||||
concurrency: int = 1
|
concurrency: int = 1
|
||||||
response_format: ResponseFormat
|
response_format: ResponseFormat
|
||||||
|
|
||||||
from docling.datamodel.layout_model_specs import (
|
|
||||||
LayoutModelConfig,
|
|
||||||
)
|
|
||||||
|
|
||||||
class TwoStageVlmOptions(BaseModel):
|
class TwoStageVlmOptions(BaseModel):
|
||||||
kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"
|
kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"
|
||||||
|
|
||||||
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS
|
response_format: ResponseFormat # final response of the VLM
|
||||||
|
|
||||||
layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2
|
layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2
|
||||||
|
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS
|
||||||
|
@ -143,7 +143,9 @@ GEMMA3_27B_MLX = InlineVlmOptions(
|
|||||||
)
|
)
|
||||||
|
|
||||||
VLM2STAGE = TwoStageVlmOptions(
|
VLM2STAGE = TwoStageVlmOptions(
|
||||||
vlm_options=SMOLDOCLING_MLX, layout_options=DOCLING_LAYOUT_HERON
|
vlm_options=SMOLDOCLING_MLX,
|
||||||
|
layout_options=DOCLING_LAYOUT_HERON,
|
||||||
|
response_format=SMOLDOCLING_MLX.response_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat, Page
|
from docling.datamodel.base_models import InputFormat, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import VlmPipelineOptions
|
from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions
|
||||||
from docling.datamodel.pipeline_options_vlm_model import (
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
ApiVlmOptions,
|
ApiVlmOptions,
|
||||||
InferenceFramework,
|
InferenceFramework,
|
||||||
@ -115,38 +115,47 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
TwoStageVlmOptions, self.pipeline_options.vlm_options
|
TwoStageVlmOptions, self.pipeline_options.vlm_options
|
||||||
)
|
)
|
||||||
|
|
||||||
layout_options = twostagevlm_options.layout_options
|
stage_1_options = twostagevlm_options.layout_options
|
||||||
vlm_options = twostagevlm_options.vlm_options
|
stage_2_options = twostagevlm_options.vlm_options
|
||||||
|
|
||||||
layout_model = LayoutModel(
|
layout_model = LayoutModel(
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
options=layout_options,
|
options=LayoutOptions(
|
||||||
|
create_orphan_clusters=False, model_spec=stage_1_options
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
if vlm_options.inference_framework == InferenceFramework.MLX:
|
if (
|
||||||
|
isinstance(stage_2_options, InlineVlmOptions)
|
||||||
|
and stage_2_options.inference_framework == InferenceFramework.MLX
|
||||||
|
):
|
||||||
vlm_model_mlx = HuggingFaceMlxModel(
|
vlm_model_mlx = HuggingFaceMlxModel(
|
||||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
vlm_options=vlm_options,
|
vlm_options=stage_2_options,
|
||||||
)
|
)
|
||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx)
|
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx)
|
||||||
]
|
]
|
||||||
elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
|
elif (
|
||||||
|
isinstance(stage_2_options, InlineVlmOptions)
|
||||||
|
and stage_2_options.inference_framework
|
||||||
|
== InferenceFramework.TRANSFORMERS
|
||||||
|
):
|
||||||
vlm_model_hf = HuggingFaceTransformersVlmModel(
|
vlm_model_hf = HuggingFaceTransformersVlmModel(
|
||||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
vlm_options=vlm_options,
|
vlm_options=stage_2_options,
|
||||||
)
|
)
|
||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf)
|
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf)
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
|
f"Could not instantiate the right type of VLM pipeline: {stage_2_options}"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.enrichment_pipe = [
|
self.enrichment_pipe = [
|
||||||
|
Loading…
Reference in New Issue
Block a user