diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 9791a251..6c99b5a8 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -230,6 +230,18 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     provenance: str = ""
 
 
+class PictureDescriptionLlamaStackApiOptions(PictureDescriptionBaseOptions):
+    kind: ClassVar[Literal["llama-stack"]] = "llama-stack"
+
+    url: AnyUrl = AnyUrl("http://localhost:8321/v1/inference/chat-completion")
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    timeout: float = 20
+
+    prompt: str = "Describe this image in a few sentences."
+    provenance: str = ""
+
+
 class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     kind: ClassVar[Literal["vlm"]] = "vlm"
 
diff --git a/docling/models/picture_description_llama_stack_api_model.py b/docling/models/picture_description_llama_stack_api_model.py
new file mode 100644
index 00000000..9d6d2f03
--- /dev/null
+++ b/docling/models/picture_description_llama_stack_api_model.py
@@ -0,0 +1,130 @@
+import base64
+import io
+import logging
+from pathlib import Path
+from typing import Iterable, List, Optional, Type, Union
+
+import requests
+from PIL import Image
+from pydantic import BaseModel, ConfigDict
+
+from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
+    PictureDescriptionBaseOptions,
+    PictureDescriptionLlamaStackApiOptions,
+)
+from docling.exceptions import OperationNotAllowed
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+
+_log = logging.getLogger(__name__)
+
+
+class ToolCall(BaseModel):
+    call_id: str
+    tool_name: str
+    arguments: str
+    arguments_json: Optional[str]
+
+
+class CompletionMessage(BaseModel):
+    role: str
+    content: str
+    stop_reason: str
+    tool_calls: Optional[List[ToolCall]]
+
+
+class Metric(BaseModel):
+    metric: str
+    unit: Optional[str]
+    value: int
+
+
+class LogProbs(BaseModel):
+    logprobs_by_token: dict[str, int]
+
+
+class ApiResponse(BaseModel):
+    model_config = ConfigDict(
+        protected_namespaces=(),
+    )
+
+    completion_message: CompletionMessage
+    logprobs: Optional[LogProbs] = None
+    metrics: List[Metric] = []
+
+
+class PictureDescriptionLlamaStackApiModel(PictureDescriptionBaseModel):
+    # elements_batch_size = 4
+
+    @classmethod
+    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
+        return PictureDescriptionLlamaStackApiOptions
+
+    def __init__(
+        self,
+        enabled: bool,
+        enable_remote_services: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: PictureDescriptionLlamaStackApiOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            enable_remote_services=enable_remote_services,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: PictureDescriptionLlamaStackApiOptions
+
+        if self.enabled:
+            if not enable_remote_services:
+                raise OperationNotAllowed(
+                    "Connections to remote services are only allowed when set explicitly. "
+                    "pipeline_options.enable_remote_services=True."
+                )
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        # Note: technically we could make a batch request here,
+        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
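+        # Each image becomes its own chat-completion request: the prompt is sent as a
+        # "text" content item, the PNG is base64-encoded into an "image" content item,
+        # and options.params (e.g. model_id) is merged into the request payload.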
+        for image in images:
+            img_io = io.BytesIO()
+            image.save(img_io, "PNG")
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": self.options.prompt,
+                        },
+                        {
+                            "type": "image",
+                            "image": {
+                                "data": image_base64,
+                            },
+                        },
+                    ],
+                }
+            ]
+
+            payload = {
+                "messages": messages,
+                **self.options.params,
+            }
+
+            r = requests.post(
+                str(self.options.url),
+                headers=self.options.headers,
+                json=payload,
+                timeout=self.options.timeout,
+            )
+            if not r.ok:
+                _log.error(f"Error calling the API. Response was {r.text}")
+            r.raise_for_status()
+
+            api_resp = ApiResponse.model_validate_json(r.text)
+            generated_text = api_resp.completion_message.content.strip()
+            yield generated_text
diff --git a/docling/models/plugins/defaults.py b/docling/models/plugins/defaults.py
index 00873579..98da0d54 100644
--- a/docling/models/plugins/defaults.py
+++ b/docling/models/plugins/defaults.py
@@ -1,6 +1,9 @@
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.picture_description_api_model import PictureDescriptionApiModel
+from docling.models.picture_description_llama_stack_api_model import (
+    PictureDescriptionLlamaStackApiModel,
+)
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -24,5 +27,6 @@ def picture_description():
         "picture_description": [
             PictureDescriptionVlmModel,
             PictureDescriptionApiModel,
+            PictureDescriptionLlamaStackApiModel,
         ]
     }
diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py
index 8e105d24..f373f9fa 100644
--- a/docs/examples/pictures_description_api.py
+++ b/docs/examples/pictures_description_api.py
@@ -10,6 +10,7 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
     PictureDescriptionApiOptions,
+    PictureDescriptionLlamaStackApiOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 
@@ -22,7 +23,19 @@ def vllm_local_options(model: str):
             seed=42,
             max_completion_tokens=200,
         ),
-        prompt="Describe the image in three sentences. Be consise and accurate.",
+        prompt="Describe the image in three sentences. Be concise and accurate.",
         timeout=90,
     )
     return options
+
+
+def llama_stack_local_options(model: str):
+    options = PictureDescriptionLlamaStackApiOptions(
+        url="http://localhost:8321/v1/inference/chat-completion",
+        params=dict(
+            model_id=model,
+        ),
+        prompt="Describe the image in three sentences. Be concise and accurate.",
+        timeout=90,
+    )
+    return options
@@ -76,11 +89,14 @@ def main():
 
     # The PictureDescriptionApiOptions() allows to interface with APIs supporting
     # the multi-modal chat interface. Here follow a few example on how to configure those.
     #
     # One possibility is self-hosting model, e.g. via VLLM.
     # $ vllm serve MODEL_NAME
     # Then PictureDescriptionApiOptions can point to the localhost endpoint.
     #
+    # The PictureDescriptionLlamaStackApiOptions() allows interfacing with vision models registered with a llama-stack server via its chat-completion API.
+    # If you have a llama-stack instance running locally via Docker or Podman, point the 'url' option to its endpoint.
+    #
     # Example for the Granite Vision model: (uncomment the following lines)
     # pipeline_options.picture_description_options = vllm_local_options(
     #     model="ibm-granite/granite-vision-3.1-2b-preview"
diff --git a/docs/usage/enrichments.md b/docs/usage/enrichments.md
index bec9ea69..723348f2 100644
--- a/docs/usage/enrichments.md
+++ b/docs/usage/enrichments.md
@@ -176,7 +176,7 @@ pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
 
 The option class `PictureDescriptionApiOptions` allows to use models hosted on remote platforms, e.g.
 on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
-or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc.
+or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc. The `PictureDescriptionLlamaStackApiOptions` class is a variant which allows using vision models served through a llama-stack server.
 
 _Note: in most cases this option will send your data to the remote service provider._
 
diff --git a/docs/usage/index.md b/docs/usage/index.md
index acf33976..de4c52f6 100644
--- a/docs/usage/index.md
+++ b/docs/usage/index.md
@@ -112,6 +112,7 @@ _Note: This option is only related to the system sending user data to remote ser
 
 The options in this list require the explicit `enable_remote_services=True` when processing the documents.
 
 - `PictureDescriptionApiOptions`: Using vision models via API calls.
+- `PictureDescriptionLlamaStackApiOptions`: Using vision models via llama-stack server API calls.
 
 #### Adjust pipeline features
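For reference, a minimal usage sketch of the new options class, following the pattern of the existing docs/examples/pictures_description_api.py. It assumes this patch is applied and a llama-stack server is reachable at the default URL; the model id and the input document path are illustrative placeholders, not values taken from the patch.

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionLlamaStackApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(
    enable_remote_services=True,  # required: descriptions are computed by a remote service
    do_picture_description=True,
)
pipeline_options.picture_description_options = PictureDescriptionLlamaStackApiOptions(
    url="http://localhost:8321/v1/inference/chat-completion",
    params=dict(model_id="meta-llama/Llama-3.2-11B-Vision-Instruct"),  # illustrative model id
    prompt="Describe the image in three sentences. Be concise and accurate.",
    timeout=90,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("path/to/document.pdf")  # placeholder input document

for picture in result.document.pictures:
    # Each generated description is stored as an annotation on the picture item.
    print(picture.self_ref, picture.annotations)
```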