feat: add a new PictureDescription Model to support llama-stack API
Introduces a new PictureDescription API model to support the llama-stack API. It adds picture_description_llama_stack_api_model.py, a variant of the existing picture_description_api_model.py with the changes needed to accommodate the request and response payloads used by the llama-stack chat-completion API specification.

Signed-off-by: Rafael T. C. Soares <rafaelcba@gmail.com>
parent c605edd8e9 · commit f33fe7dbf0
@@ -230,6 +230,18 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
    provenance: str = ""


class PictureDescriptionLlamaStackApiOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[Literal["llama-stack"]] = "llama-stack"

    url: AnyUrl = AnyUrl("http://localhost:8321/v1/inference/chat-completion")
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    timeout: float = 20

    prompt: str = "Describe this image in a few sentences."
    provenance: str = ""


class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[Literal["vlm"]] = "vlm"
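For orientation, a minimal sketch of instantiating the new options class (which, judging by the import in the new model file below, lives in `docling.datamodel.pipeline_options`). The `model_id` value is a placeholder and the exact `params` keys depend on your llama-stack deployment:

```python
from docling.datamodel.pipeline_options import PictureDescriptionLlamaStackApiOptions

# Sketch only: model_id is a placeholder for a vision model registered with llama-stack.
options = PictureDescriptionLlamaStackApiOptions(
    url="http://localhost:8321/v1/inference/chat-completion",
    params={"model_id": "my-registered-vision-model"},
    prompt="Describe this image in a few sentences.",
    timeout=30,
)
```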
docling/models/picture_description_llama_stack_api_model.py · 130 lines · new file
@@ -0,0 +1,130 @@
import base64
import io
import logging
from pathlib import Path
from typing import Iterable, List, Optional, Type, Union

import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict

from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionLlamaStackApiOptions,
    PictureDescriptionBaseOptions,
)
from docling.exceptions import OperationNotAllowed
from docling.models.picture_description_base_model import PictureDescriptionBaseModel

_log = logging.getLogger(__name__)


class ToolCall(BaseModel):
    call_id: str
    tool_name: str
    arguments: str
    arguments_json: Optional[str]


class CompletionMessage(BaseModel):
    role: str
    content: str
    stop_reason: str
    tool_calls: Optional[List[ToolCall]]


class Metric(BaseModel):
    metric: str
    unit: Optional[str]
    value: int


class LogProbs(BaseModel):
    logprobs_by_token: dict[str, int]


class ApiResponse(BaseModel):
    model_config = ConfigDict(
        protected_namespaces=(),
    )

    completion_message: CompletionMessage
    logprobs: Optional[LogProbs] = None
    metrics: List[Metric] = []


class PictureDescriptionLlamaStackApiModel(PictureDescriptionBaseModel):
    # elements_batch_size = 4

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        return PictureDescriptionLlamaStackApiOptions

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionLlamaStackApiOptions,
        accelerator_options: AcceleratorOptions,
    ):
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: PictureDescriptionLlamaStackApiOptions

        if self.enabled:
            if not enable_remote_services:
                raise OperationNotAllowed(
                    "Connections to remote services is only allowed when set explicitly. "
                    "pipeline_options.enable_remote_services=True."
                )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Note: technically we could make a batch request here,
        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
        for image in images:
            img_io = io.BytesIO()
            image.save(img_io, "PNG")
            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.options.prompt,
                        },
                        {
                            "type": "image",
                            "image": {
                                "data": image_base64
                            },
                        },
                    ],
                }
            ]

            payload = {
                "messages": messages,
                **self.options.params,
            }

            r = requests.post(
                str(self.options.url),
                headers=self.options.headers,
                json=payload,
                timeout=self.options.timeout,
            )
            if not r.ok:
                _log.error(f"Error calling the API. Response was {r.text}")
            r.raise_for_status()

            api_resp = ApiResponse.model_validate_json(r.text)
            generated_text = api_resp.completion_message.content.strip()
            yield generated_text
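To illustrate the response handling above, here is a hedged sketch of validating a llama-stack chat-completion reply with the `ApiResponse` model; the JSON body is a made-up minimal example, not captured from a real server:

```python
from docling.models.picture_description_llama_stack_api_model import ApiResponse

# Made-up minimal response body following the fields declared above.
sample = """
{
  "completion_message": {
    "role": "assistant",
    "content": "A bar chart comparing quarterly revenue across regions.",
    "stop_reason": "end_of_turn",
    "tool_calls": []
  },
  "metrics": []
}
"""

resp = ApiResponse.model_validate_json(sample)
print(resp.completion_message.content.strip())
```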
@ -1,6 +1,9 @@
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.ocr_mac_model import OcrMacModel
|
||||
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
||||
from docling.models.picture_description_llama_stack_api_model import (
|
||||
PictureDescriptionLlamaStackApiModel,
|
||||
)
|
||||
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
@@ -24,5 +27,6 @@ def picture_description():
        "picture_description": [
            PictureDescriptionVlmModel,
            PictureDescriptionApiModel,
            PictureDescriptionLlamaStackApiModel,
        ]
    }
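Purely as an illustration of why `get_options_type()` matters for this registration (this is not docling's actual plugin loader), a factory could match a configured options object to its model class roughly like this:

```python
# Hypothetical helper, for illustration only.
def resolve_model_class(options, model_classes):
    for cls in model_classes:
        if isinstance(options, cls.get_options_type()):
            return cls
    raise ValueError(f"No picture description model registered for {type(options).__name__}")
```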
@@ -10,6 +10,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    PictureDescriptionLlamaStackApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -22,7 +23,19 @@ def vllm_local_options(model: str):
            seed=42,
            max_completion_tokens=200,
        ),
-        prompt="Describe the image in three sentences. Be consise and accurate.",
+        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options


def llama_stack_local_options(model: str):
    options = PictureDescriptionLlamaStackApiOptions(
        url="http://localhost:8321/v1/inference/chat-completion",
        params=dict(
            model_id=model,
        ),
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options
@@ -76,11 +89,14 @@ def main():

    # The PictureDescriptionApiOptions() allows to interface with APIs supporting
    # the multi-modal chat interface. Here follow a few examples on how to configure those.
    #
    #
    # One possibility is self-hosting a model, e.g. via VLLM.
    # $ vllm serve MODEL_NAME
    # Then PictureDescriptionApiOptions can point to the localhost endpoint.
    #
    # The PictureDescriptionLlamaStackApiOptions() allows to interface with visual models
    # registered with llama-stack via its chat-completion API.
    # If you have a llama-stack instance running locally via Docker or Podman,
    # you can point the 'url' to its endpoint.
    #
    # Example for the Granite Vision model: (uncomment the following lines)
    # pipeline_options.picture_description_options = vllm_local_options(
    #     model="ibm-granite/granite-vision-3.1-2b-preview"
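Putting the pieces of this example file together, a hedged end-to-end sketch using the `llama_stack_local_options()` helper defined above; the model id is the placeholder already used in the example and the document path is arbitrary:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.enable_remote_services = True  # required for any remote API option
pipeline_options.picture_description_options = llama_stack_local_options(
    model="ibm-granite/granite-vision-3.1-2b-preview"  # placeholder model id
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("path/to/document.pdf")  # placeholder input path
```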
@@ -176,7 +176,7 @@ pipeline_options.picture_description_options = PictureDescriptionVlmOptions(

The option class `PictureDescriptionApiOptions` allows to use models hosted on remote platforms, e.g.
on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
-or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc.
+or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc. The `PictureDescriptionLlamaStackApiOptions` class is a variant which allows using visual models served through a llama-stack server.

_Note: in most cases this option will send your data to the remote service provider._
@@ -112,6 +112,7 @@ _Note: This option is only related to the system sending user data to remote services._

The options in this list require the explicit `enable_remote_services=True` when processing the documents.

- `PictureDescriptionApiOptions`: Using vision models via API calls.
- `PictureDescriptionLlamaStackApiOptions`: Using vision models via llama-stack server API calls.
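As a reminder of what that requirement looks like in code (a small sketch; the new llama-stack model raises `OperationNotAllowed` otherwise, as shown in its `__init__` above):

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.enable_remote_services = True  # without this, the remote *ApiOptions above are rejected
```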
#### Adjust pipeline features