feat: add a new PictureDescription model to support the llama-stack API

Introduces a new PictureDescription API model to support the llama-stack API.
It adds picture_description_llama_stack_api_model.py, a variant of the existing picture_description_api_model.py
adapted to the request and response payloads of the llama-stack chat-completion API specification.
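
For context, a minimal usage sketch of the new option within a docling conversion pipeline (untested; the endpoint URL, model id, and input path are placeholders):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        PictureDescriptionLlamaStackApiOptions,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption

    pipeline_options = PdfPipelineOptions(
        enable_remote_services=True,  # required for any remote picture-description backend
        do_picture_description=True,
    )
    pipeline_options.picture_description_options = PictureDescriptionLlamaStackApiOptions(
        url="http://localhost:8321/v1/inference/chat-completion",
        params=dict(model_id="MODEL_NAME"),  # placeholder model id
    )

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    result = converter.convert("document.pdf")  # placeholder input path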

Signed-off-by: Rafael T. C. Soares <rafaelcba@gmail.com>
Rafael T. C. Soares 2025-04-09 14:46:00 -05:00
parent c605edd8e9
commit f33fe7dbf0
6 changed files with 166 additions and 3 deletions

View File

@@ -230,6 +230,18 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
    provenance: str = ""


class PictureDescriptionLlamaStackApiOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[Literal["llama-stack"]] = "llama-stack"

    url: AnyUrl = AnyUrl("http://localhost:8321/v1/inference/chat-completion")
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    timeout: float = 20
    prompt: str = "Describe this image in a few sentences."
    provenance: str = ""


class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[Literal["vlm"]] = "vlm"

View File

@@ -0,0 +1,130 @@
import base64
import io
import logging
from pathlib import Path
from typing import Iterable, List, Optional, Type, Union

import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict

from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionBaseOptions,
    PictureDescriptionLlamaStackApiOptions,
)
from docling.exceptions import OperationNotAllowed
from docling.models.picture_description_base_model import PictureDescriptionBaseModel

_log = logging.getLogger(__name__)


class ToolCall(BaseModel):
    call_id: str
    tool_name: str
    arguments: str
    arguments_json: Optional[str] = None


class CompletionMessage(BaseModel):
    role: str
    content: str
    stop_reason: str
    tool_calls: Optional[List[ToolCall]] = None


class Metric(BaseModel):
    metric: str
    unit: Optional[str] = None
    value: int


class LogProbs(BaseModel):
    logprobs_by_token: dict[str, float]


class ApiResponse(BaseModel):
    model_config = ConfigDict(
        protected_namespaces=(),
    )

    completion_message: CompletionMessage
    logprobs: Optional[LogProbs] = None
    metrics: List[Metric] = []


class PictureDescriptionLlamaStackApiModel(PictureDescriptionBaseModel):
    # elements_batch_size = 4

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        return PictureDescriptionLlamaStackApiOptions

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionLlamaStackApiOptions,
        accelerator_options: AcceleratorOptions,
    ):
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: PictureDescriptionLlamaStackApiOptions

        if self.enabled:
            if not enable_remote_services:
                raise OperationNotAllowed(
                    "Connections to remote services are only allowed when enabled explicitly. "
                    "Set pipeline_options.enable_remote_services=True."
                )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Note: technically we could make a batch request here,
        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
        for image in images:
            # Encode the picture as a base64 PNG for the request payload.
            img_io = io.BytesIO()
            image.save(img_io, "PNG")
            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

            # Build a single-turn chat message combining the prompt and the image.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.options.prompt,
                        },
                        {
                            "type": "image",
                            "image": {"data": image_base64},
                        },
                    ],
                }
            ]

            payload = {
                "messages": messages,
                **self.options.params,
            }

            r = requests.post(
                str(self.options.url),
                headers=self.options.headers,
                json=payload,
                timeout=self.options.timeout,
            )
            if not r.ok:
                _log.error(f"Error calling the API. Response was {r.text}")
            r.raise_for_status()

            api_resp = ApiResponse.model_validate_json(r.text)
            generated_text = api_resp.completion_message.content.strip()
            yield generated_text
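
For reference, a sketch of a response body that the ApiResponse model above would accept (field values are illustrative, not captured from a real llama-stack server):

    sample = """
    {
      "completion_message": {
        "role": "assistant",
        "content": "A bar chart comparing quarterly revenue across regions.",
        "stop_reason": "end_of_turn",
        "tool_calls": []
      },
      "logprobs": null,
      "metrics": []
    }
    """
    parsed = ApiResponse.model_validate_json(sample)
    print(parsed.completion_message.content)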

View File

@@ -1,6 +1,9 @@
from docling.models.easyocr_model import EasyOcrModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.models.picture_description_api_model import PictureDescriptionApiModel
from docling.models.picture_description_llama_stack_api_model import (
    PictureDescriptionLlamaStackApiModel,
)
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -24,5 +27,6 @@ def picture_description():
"picture_description": [ "picture_description": [
PictureDescriptionVlmModel, PictureDescriptionVlmModel,
PictureDescriptionApiModel, PictureDescriptionApiModel,
PictureDescriptionLlamaStackApiModel,
] ]
} }

View File

@@ -10,6 +10,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    PictureDescriptionLlamaStackApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -22,7 +23,19 @@ def vllm_local_options(model: str):
            seed=42,
            max_completion_tokens=200,
        ),
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options


def llama_stack_local_options(model: str):
    options = PictureDescriptionLlamaStackApiOptions(
        url="http://localhost:8321/v1/inference/chat-completion",
        params=dict(
            model_id=model,
        ),
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options
@@ -81,6 +94,9 @@ def main():
    # $ vllm serve MODEL_NAME
    # Then PictureDescriptionApiOptions can point to the localhost endpoint.
    #
    # PictureDescriptionLlamaStackApiOptions allows interfacing with vision models registered with llama-stack via its chat-completion API.
    # If you have a llama-stack instance running locally via Docker or Podman, you can point the 'url' to its endpoint.
    #
    # Example for the Granite Vision model: (uncomment the following lines)
    # pipeline_options.picture_description_options = vllm_local_options(
    #     model="ibm-granite/granite-vision-3.1-2b-preview"

View File

@@ -176,7 +176,7 @@ pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
The option class `PictureDescriptionApiOptions` allows using models hosted on remote platforms, e.g.
on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc. The `PictureDescriptionLlamaStackApiOptions` class is a variant which allows using vision models served through a llama-stack server.

_Note: in most cases this option will send your data to the remote service provider._
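
A sketch of selecting the llama-stack variant (the model id is a placeholder; the URL mirrors the option's default):

    from docling.datamodel.pipeline_options import PictureDescriptionLlamaStackApiOptions

    pipeline_options.picture_description_options = PictureDescriptionLlamaStackApiOptions(
        url="http://localhost:8321/v1/inference/chat-completion",
        params=dict(model_id="MODEL_NAME"),  # placeholder model id
        prompt="Describe this image in a few sentences.",
    )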

View File

@@ -112,6 +112,7 @@ _Note: This option is only related to the system sending user data to remote services._
The options in this list require the explicit `enable_remote_services=True` when processing the documents (a minimal sketch follows the list).

- `PictureDescriptionApiOptions`: Using vision models via API calls.
- `PictureDescriptionLlamaStackApiOptions`: Using vision models via llama-stack server API calls.
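
A minimal sketch of the required flag (mirroring how the model's constructor is gated; without it, initialization raises `OperationNotAllowed`):

    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        PictureDescriptionLlamaStackApiOptions,
    )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.picture_description_options = PictureDescriptionLlamaStackApiOptions()
    pipeline_options.enable_remote_services = True  # explicit opt-in to remote calls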
#### Adjust pipeline features