feat: add a new PictureDescription model to support the llama-stack API

Introduces a new PictureDescription API model to support the llama-stack API.
It adds picture_description_llama_stack_api_model.py, a variant of the existing picture_description_api_model.py
adapted to the request and response payloads of the llama-stack chat-completion API specification.
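
For context, a minimal usage sketch of the new option within a docling conversion pipeline (untested; the endpoint URL, model id, and input path are placeholders):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        PictureDescriptionLlamaStackApiOptions,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption

    pipeline_options = PdfPipelineOptions(
        enable_remote_services=True,  # required for any remote picture-description backend
        do_picture_description=True,
    )
    pipeline_options.picture_description_options = PictureDescriptionLlamaStackApiOptions(
        url="http://localhost:8321/v1/inference/chat-completion",
        params=dict(model_id="MODEL_NAME"),  # placeholder model id
    )

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    result = converter.convert("document.pdf")  # placeholder input path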

Signed-off-by: Rafael T. C. Soares <rafaelcba@gmail.com>
Rafael T. C. Soares 2025-04-09 14:46:00 -05:00
parent c605edd8e9
commit f33fe7dbf0
6 changed files with 166 additions and 3 deletions

View File

@@ -230,6 +230,18 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
    provenance: str = ""


class PictureDescriptionLlamaStackApiOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[Literal["llama-stack"]] = "llama-stack"

    url: AnyUrl = AnyUrl("http://localhost:8321/v1/inference/chat-completion")
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    timeout: float = 20
    prompt: str = "Describe this image in a few sentences."
    provenance: str = ""


class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[Literal["vlm"]] = "vlm"

View File

@@ -0,0 +1,130 @@
import base64
import io
import logging
from pathlib import Path
from typing import Iterable, List, Optional, Type, Union

import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict

from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionBaseOptions,
    PictureDescriptionLlamaStackApiOptions,
)
from docling.exceptions import OperationNotAllowed
from docling.models.picture_description_base_model import PictureDescriptionBaseModel

_log = logging.getLogger(__name__)


class ToolCall(BaseModel):
    call_id: str
    tool_name: str
    arguments: str
    arguments_json: Optional[str] = None


class CompletionMessage(BaseModel):
    role: str
    content: str
    stop_reason: str
    tool_calls: Optional[List[ToolCall]] = None


class Metric(BaseModel):
    metric: str
    unit: Optional[str] = None
    value: int


class LogProbs(BaseModel):
    logprobs_by_token: dict[str, float]


class ApiResponse(BaseModel):
    model_config = ConfigDict(
        protected_namespaces=(),
    )

    completion_message: CompletionMessage
    logprobs: Optional[LogProbs] = None
    metrics: List[Metric] = []


class PictureDescriptionLlamaStackApiModel(PictureDescriptionBaseModel):
    # elements_batch_size = 4

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        return PictureDescriptionLlamaStackApiOptions

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionLlamaStackApiOptions,
        accelerator_options: AcceleratorOptions,
    ):
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: PictureDescriptionLlamaStackApiOptions

        if self.enabled:
            if not enable_remote_services:
                raise OperationNotAllowed(
                    "Connections to remote services are only allowed when enabled explicitly. "
                    "Set pipeline_options.enable_remote_services=True."
                )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Note: technically we could make a batch request here,
        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
        for image in images:
            # Encode the picture as a base64 PNG for the request payload.
            img_io = io.BytesIO()
            image.save(img_io, "PNG")
            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

            # Build a single-turn chat message combining the prompt and the image.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.options.prompt,
                        },
                        {
                            "type": "image",
                            "image": {"data": image_base64},
                        },
                    ],
                }
            ]

            payload = {
                "messages": messages,
                **self.options.params,
            }

            r = requests.post(
                str(self.options.url),
                headers=self.options.headers,
                json=payload,
                timeout=self.options.timeout,
            )
            if not r.ok:
                _log.error(f"Error calling the API. Response was {r.text}")
            r.raise_for_status()

            api_resp = ApiResponse.model_validate_json(r.text)
            generated_text = api_resp.completion_message.content.strip()
            yield generated_text
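
For reference, a sketch of a response body that the ApiResponse model above would accept (field values are illustrative, not captured from a real llama-stack server):

    sample = """
    {
      "completion_message": {
        "role": "assistant",
        "content": "A bar chart comparing quarterly revenue across regions.",
        "stop_reason": "end_of_turn",
        "tool_calls": []
      },
      "logprobs": null,
      "metrics": []
    }
    """
    parsed = ApiResponse.model_validate_json(sample)
    print(parsed.completion_message.content)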

View File

@@ -1,6 +1,9 @@
from docling.models.easyocr_model import EasyOcrModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.models.picture_description_api_model import PictureDescriptionApiModel
from docling.models.picture_description_llama_stack_api_model import (
    PictureDescriptionLlamaStackApiModel,
)
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -24,5 +27,6 @@ def picture_description():
"picture_description": [ "picture_description": [
PictureDescriptionVlmModel, PictureDescriptionVlmModel,
PictureDescriptionApiModel, PictureDescriptionApiModel,
PictureDescriptionLlamaStackApiModel,
] ]
} }

View File

@@ -10,6 +10,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    PictureDescriptionLlamaStackApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -22,7 +23,19 @@ def vllm_local_options(model: str):
            seed=42,
            max_completion_tokens=200,
        ),
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options


def llama_stack_local_options(model: str):
    options = PictureDescriptionLlamaStackApiOptions(
        url="http://localhost:8321/v1/inference/chat-completion",
        params=dict(
            model_id=model,
        ),
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options
@@ -81,6 +94,9 @@ def main():
    # $ vllm serve MODEL_NAME
    # Then PictureDescriptionApiOptions can point to the localhost endpoint.
    #
    # PictureDescriptionLlamaStackApiOptions allows interfacing with vision models registered with llama-stack via its chat-completion API.
    # If you have a llama-stack instance running locally via Docker or Podman, you can point the 'url' to its endpoint.
    #
    # Example for the Granite Vision model: (uncomment the following lines)
    # pipeline_options.picture_description_options = vllm_local_options(
    #     model="ibm-granite/granite-vision-3.1-2b-preview"

View File

@@ -176,7 +176,7 @@ pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
The option class `PictureDescriptionApiOptions` allows using models hosted on remote platforms, e.g.
on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc. The `PictureDescriptionLlamaStackApiOptions` class is a variant which allows using vision models served through a llama-stack server.

_Note: in most cases this option will send your data to the remote service provider._
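
A sketch of selecting the llama-stack variant (the model id is a placeholder; the URL mirrors the option's default):

    from docling.datamodel.pipeline_options import PictureDescriptionLlamaStackApiOptions

    pipeline_options.picture_description_options = PictureDescriptionLlamaStackApiOptions(
        url="http://localhost:8321/v1/inference/chat-completion",
        params=dict(model_id="MODEL_NAME"),  # placeholder model id
        prompt="Describe this image in a few sentences.",
    )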

View File

@@ -112,6 +112,7 @@ _Note: This option is only related to the system sending user data to remote services._
The options in this list require the explicit `enable_remote_services=True` when processing the documents (a minimal sketch follows the list).

- `PictureDescriptionApiOptions`: Using vision models via API calls.
- `PictureDescriptionLlamaStackApiOptions`: Using vision models via llama-stack server API calls.
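
A minimal sketch of the required flag (mirroring how the model's constructor is gated; without it, initialization raises `OperationNotAllowed`):

    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        PictureDescriptionLlamaStackApiOptions,
    )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.picture_description_options = PictureDescriptionLlamaStackApiOptions()
    pipeline_options.enable_remote_services = True  # explicit opt-in to remote calls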
#### Adjust pipeline features