feat: add a new PictureDescription Model to support llama-stack API
Introduces a new PictureDescription API model to support the llama-stack API. It adds picture_description_llama_stack_api_model.py, a variant of the existing picture_description_api_model.py with the changes needed to accommodate the request and response payloads used by the llama-stack chat-completion API specification.

Signed-off-by: Rafael T. C. Soares <rafaelcba@gmail.com>
parent c605edd8e9 · commit f33fe7dbf0
@@ -230,6 +230,18 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
    provenance: str = ""


class PictureDescriptionLlamaStackApiOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[Literal["llama-stack"]] = "llama-stack"

    url: AnyUrl = AnyUrl("http://localhost:8321/v1/inference/chat-completion")
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    timeout: float = 20

    prompt: str = "Describe this image in a few sentences."
    provenance: str = ""


class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[Literal["vlm"]] = "vlm"
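For orientation, a minimal sketch of instantiating the new options class (which, judging by the import in the new model file below, lives in `docling.datamodel.pipeline_options`). The `model_id` value is a placeholder and the exact `params` keys depend on your llama-stack deployment:

```python
from docling.datamodel.pipeline_options import PictureDescriptionLlamaStackApiOptions

# Sketch only: model_id is a placeholder for a vision model registered with llama-stack.
options = PictureDescriptionLlamaStackApiOptions(
    url="http://localhost:8321/v1/inference/chat-completion",
    params={"model_id": "my-registered-vision-model"},
    prompt="Describe this image in a few sentences.",
    timeout=30,
)
```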
docling/models/picture_description_llama_stack_api_model.py · 130 lines · new file
@@ -0,0 +1,130 @@
import base64
import io
import logging
from pathlib import Path
from typing import Iterable, List, Optional, Type, Union

import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict

from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionLlamaStackApiOptions,
    PictureDescriptionBaseOptions,
)
from docling.exceptions import OperationNotAllowed
from docling.models.picture_description_base_model import PictureDescriptionBaseModel

_log = logging.getLogger(__name__)


class ToolCall(BaseModel):
    call_id: str
    tool_name: str
    arguments: str
    arguments_json: Optional[str]


class CompletionMessage(BaseModel):
    role: str
    content: str
    stop_reason: str
    tool_calls: Optional[List[ToolCall]]


class Metric(BaseModel):
    metric: str
    unit: Optional[str]
    value: int


class LogProbs(BaseModel):
    logprobs_by_token: dict[str, int]


class ApiResponse(BaseModel):
    model_config = ConfigDict(
        protected_namespaces=(),
    )

    completion_message: CompletionMessage
    logprobs: Optional[LogProbs] = None
    metrics: List[Metric] = []


class PictureDescriptionLlamaStackApiModel(PictureDescriptionBaseModel):
    # elements_batch_size = 4

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        return PictureDescriptionLlamaStackApiOptions

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionLlamaStackApiOptions,
        accelerator_options: AcceleratorOptions,
    ):
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: PictureDescriptionLlamaStackApiOptions

        if self.enabled:
            if not enable_remote_services:
                raise OperationNotAllowed(
                    "Connections to remote services is only allowed when set explicitly. "
                    "pipeline_options.enable_remote_services=True."
                )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Note: technically we could make a batch request here,
        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
        for image in images:
            img_io = io.BytesIO()
            image.save(img_io, "PNG")
            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.options.prompt,
                        },
                        {
                            "type": "image",
                            "image": {
                                "data": image_base64
                            },
                        },
                    ],
                }
            ]

            payload = {
                "messages": messages,
                **self.options.params,
            }

            r = requests.post(
                str(self.options.url),
                headers=self.options.headers,
                json=payload,
                timeout=self.options.timeout,
            )
            if not r.ok:
                _log.error(f"Error calling the API. Response was {r.text}")
            r.raise_for_status()

            api_resp = ApiResponse.model_validate_json(r.text)
            generated_text = api_resp.completion_message.content.strip()
            yield generated_text
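To illustrate the response handling above, here is a hedged sketch of validating a llama-stack chat-completion reply with the `ApiResponse` model; the JSON body is a made-up minimal example, not captured from a real server:

```python
from docling.models.picture_description_llama_stack_api_model import ApiResponse

# Made-up minimal response body following the fields declared above.
sample = """
{
  "completion_message": {
    "role": "assistant",
    "content": "A bar chart comparing quarterly revenue across regions.",
    "stop_reason": "end_of_turn",
    "tool_calls": []
  },
  "metrics": []
}
"""

resp = ApiResponse.model_validate_json(sample)
print(resp.completion_message.content.strip())
```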
@ -1,6 +1,9 @@
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.ocr_mac_model import OcrMacModel
|
||||
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
||||
from docling.models.picture_description_llama_stack_api_model import (
|
||||
PictureDescriptionLlamaStackApiModel,
|
||||
)
|
||||
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
@@ -24,5 +27,6 @@ def picture_description():
        "picture_description": [
            PictureDescriptionVlmModel,
            PictureDescriptionApiModel,
            PictureDescriptionLlamaStackApiModel,
        ]
    }
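Purely as an illustration of why `get_options_type()` matters for this registration (this is not docling's actual plugin loader), a factory could match a configured options object to its model class roughly like this:

```python
# Hypothetical helper, for illustration only.
def resolve_model_class(options, model_classes):
    for cls in model_classes:
        if isinstance(options, cls.get_options_type()):
            return cls
    raise ValueError(f"No picture description model registered for {type(options).__name__}")
```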
@@ -10,6 +10,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    PictureDescriptionLlamaStackApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -22,7 +23,19 @@ def vllm_local_options(model: str):
            seed=42,
            max_completion_tokens=200,
        ),
-        prompt="Describe the image in three sentences. Be consise and accurate.",
+        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options


def llama_stack_local_options(model: str):
    options = PictureDescriptionLlamaStackApiOptions(
        url="http://localhost:8321/v1/inference/chat-completion",
        params=dict(
            model_id=model,
        ),
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options
@@ -76,11 +89,14 @@ def main():

    # The PictureDescriptionApiOptions() allows to interface with APIs supporting
    # the multi-modal chat interface. Here follow a few examples on how to configure those.
    #
    #
    # One possibility is self-hosting a model, e.g. via VLLM.
    # $ vllm serve MODEL_NAME
    # Then PictureDescriptionApiOptions can point to the localhost endpoint.
    #
    # The PictureDescriptionLlamaStackApiOptions() allows to interface with visual models
    # registered with llama-stack via its chat-completion API.
    # If you have a llama-stack instance running locally via Docker or Podman,
    # you can point the 'url' to its endpoint.
    #
    # Example for the Granite Vision model: (uncomment the following lines)
    # pipeline_options.picture_description_options = vllm_local_options(
    #     model="ibm-granite/granite-vision-3.1-2b-preview"
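Putting the pieces of this example file together, a hedged end-to-end sketch using the `llama_stack_local_options()` helper defined above; the model id is the placeholder already used in the example and the document path is arbitrary:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.enable_remote_services = True  # required for any remote API option
pipeline_options.picture_description_options = llama_stack_local_options(
    model="ibm-granite/granite-vision-3.1-2b-preview"  # placeholder model id
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("path/to/document.pdf")  # placeholder input path
```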
@@ -176,7 +176,7 @@ pipeline_options.picture_description_options = PictureDescriptionVlmOptions(

The option class `PictureDescriptionApiOptions` allows to use models hosted on remote platforms, e.g.
on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
-or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc.
+or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc. The `PictureDescriptionLlamaStackApiOptions` class is a variant which allows using visual models served through a llama-stack server.

_Note: in most cases this option will send your data to the remote service provider._
@@ -112,6 +112,7 @@ _Note: This option is only related to the system sending user data to remote services._

The options in this list require the explicit `enable_remote_services=True` when processing the documents.

- `PictureDescriptionApiOptions`: Using vision models via API calls.
- `PictureDescriptionLlamaStackApiOptions`: Using vision models via llama-stack server API calls.
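As a reminder of what that requirement looks like in code (a small sketch; the new llama-stack model raises `OperationNotAllowed` otherwise, as shown in its `__init__` above):

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.enable_remote_services = True  # without this, the remote *ApiOptions above are rejected
```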
#### Adjust pipeline features