feat: add a new PictureDescription Model to support llama-stack API
Introduces a new PictureDescription API model to support the llama-stack API. It adds picture_description_llama_stack_api_model.py, a variant of the existing picture_description_api_model.py with the changes needed to accommodate the request and response payloads of the llama-stack chat-completion API specification.

Signed-off-by: Rafael T. C. Soares <rafaelcba@gmail.com>
parent c605edd8e9
commit f33fe7dbf0
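Before the diff, a minimal usage sketch of what this change enables, assembled from the code added below. The input path, model id, and the image-generation flags are illustrative placeholders, not part of the commit:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionLlamaStackApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.generate_picture_images = True  # keep picture crops so they can be annotated
# Remote calls must be opted into explicitly; the new model raises
# OperationNotAllowed otherwise (see its __init__ below).
pipeline_options.enable_remote_services = True
pipeline_options.picture_description_options = PictureDescriptionLlamaStackApiOptions(
    url="http://localhost:8321/v1/inference/chat-completion",
    params=dict(model_id="ibm-granite/granite-vision-3.1-2b-preview"),  # placeholder model id
    prompt="Describe this image in a few sentences.",
    timeout=90,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("input.pdf")  # placeholder document
```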
@@ -230,6 +230,18 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     provenance: str = ""
 
 
+class PictureDescriptionLlamaStackApiOptions(PictureDescriptionBaseOptions):
+    kind: ClassVar[Literal["llama-stack"]] = "llama-stack"
+
+    url: AnyUrl = AnyUrl("http://localhost:8321/v1/inference/chat-completion")
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    timeout: float = 20
+
+    prompt: str = "Describe this image in a few sentences."
+    provenance: str = ""
+
+
 class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     kind: ClassVar[Literal["vlm"]] = "vlm"
 
docling/models/picture_description_llama_stack_api_model.py (new file, 130 lines)
@@ -0,0 +1,130 @@
+import base64
+import io
+import logging
+from pathlib import Path
+from typing import Iterable, List, Optional, Type, Union
+
+import requests
+from PIL import Image
+from pydantic import BaseModel, ConfigDict
+
+from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
+    PictureDescriptionBaseOptions,
+    PictureDescriptionLlamaStackApiOptions,
+)
+from docling.exceptions import OperationNotAllowed
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+
+_log = logging.getLogger(__name__)
+
+
+class ToolCall(BaseModel):
+    call_id: str
+    tool_name: str
+    arguments: str
+    arguments_json: Optional[str]
+
+
+class CompletionMessage(BaseModel):
+    role: str
+    content: str
+    stop_reason: str
+    tool_calls: Optional[List[ToolCall]]
+
+
+class Metric(BaseModel):
+    metric: str
+    unit: Optional[str]
+    value: int
+
+
+class LogProbs(BaseModel):
+    logprobs_by_token: dict[str, int]
+
+
+class ApiResponse(BaseModel):
+    model_config = ConfigDict(
+        protected_namespaces=(),
+    )
+
+    completion_message: CompletionMessage
+    logprobs: Optional[LogProbs] = None
+    metrics: List[Metric] = []
+
+
+class PictureDescriptionLlamaStackApiModel(PictureDescriptionBaseModel):
+    # elements_batch_size = 4
+
+    @classmethod
+    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
+        return PictureDescriptionLlamaStackApiOptions
+
+    def __init__(
+        self,
+        enabled: bool,
+        enable_remote_services: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: PictureDescriptionLlamaStackApiOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            enable_remote_services=enable_remote_services,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: PictureDescriptionLlamaStackApiOptions
+
+        if self.enabled:
+            if not enable_remote_services:
+                raise OperationNotAllowed(
+                    "Connections to remote services are only allowed when set explicitly. "
+                    "pipeline_options.enable_remote_services=True."
+                )
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        # Note: technically we could make a batch request here,
+        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
+        for image in images:
+            img_io = io.BytesIO()
+            image.save(img_io, "PNG")
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": self.options.prompt,
+                        },
+                        {
+                            "type": "image",
+                            "image": {
+                                "data": image_base64
+                            },
+                        },
+                    ],
+                }
+            ]
+
+            payload = {
+                "messages": messages,
+                **self.options.params,
+            }
+
+            r = requests.post(
+                str(self.options.url),
+                headers=self.options.headers,
+                json=payload,
+                timeout=self.options.timeout,
+            )
+            if not r.ok:
+                _log.error(f"Error calling the API. Response was {r.text}")
+            r.raise_for_status()
+
+            api_resp = ApiResponse.model_validate_json(r.text)
+            generated_text = api_resp.completion_message.content.strip()
+            yield generated_text
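For reference, a sketch of the response shape the `ApiResponse` model above will parse. The field values are illustrative, not captured from a real llama-stack server:

```python
from docling.models.picture_description_llama_stack_api_model import ApiResponse

# Made-up chat-completion response body matching the pydantic schema above.
sample = {
    "completion_message": {
        "role": "assistant",
        "content": "A bar chart comparing quarterly revenue across three regions.",
        "stop_reason": "end_of_turn",
        "tool_calls": None,
    },
    "metrics": [{"metric": "completion_tokens", "unit": None, "value": 18}],
}

resp = ApiResponse.model_validate(sample)
print(resp.completion_message.content)
```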
@@ -1,6 +1,9 @@
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.picture_description_api_model import PictureDescriptionApiModel
+from docling.models.picture_description_llama_stack_api_model import (
+    PictureDescriptionLlamaStackApiModel,
+)
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -24,5 +27,6 @@ def picture_description():
         "picture_description": [
             PictureDescriptionVlmModel,
             PictureDescriptionApiModel,
+            PictureDescriptionLlamaStackApiModel,
         ]
     }
@@ -10,6 +10,7 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
     PictureDescriptionApiOptions,
+    PictureDescriptionLlamaStackApiOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 
@@ -22,7 +23,19 @@ def vllm_local_options(model: str):
             seed=42,
             max_completion_tokens=200,
         ),
-        prompt="Describe the image in three sentences. Be consise and accurate.",
+        prompt="Describe the image in three sentences. Be concise and accurate.",
+        timeout=90,
+    )
+    return options
+
+
+def llama_stack_local_options(model: str):
+    options = PictureDescriptionLlamaStackApiOptions(
+        url="http://localhost:8321/v1/inference/chat-completion",
+        params=dict(
+            model_id=model,
+        ),
+        prompt="Describe the image in three sentences. Be concise and accurate.",
         timeout=90,
     )
     return options
@@ -81,6 +94,9 @@ def main():
     # $ vllm serve MODEL_NAME
     # Then PictureDescriptionApiOptions can point to the localhost endpoint.
     #
+    # The PictureDescriptionLlamaStackApiOptions class lets you interface with vision models registered with llama-stack via its chat-completion API.
+    # If you run a local llama-stack instance via Docker or Podman, you can point 'url' to its endpoint.
+    #
     # Example for the Granite Vision model: (uncomment the following lines)
     # pipeline_options.picture_description_options = vllm_local_options(
     #     model="ibm-granite/granite-vision-3.1-2b-preview"
@@ -176,7 +176,7 @@ pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
 
 The option class `PictureDescriptionApiOptions` allows to use models hosted on remote platforms, e.g.
 on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
-or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc.
+or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc. The `PictureDescriptionLlamaStackApiOptions` class is a variant that lets you use vision models served through a llama-stack server.
 
 _Note: in most cases this option will send your data to the remote service provider._
 
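Relevant to the note above about data leaving the machine: for a non-local llama-stack deployment, the `headers` and `params` fields of the new options class carry the deployment-specific values. A sketch, where the endpoint, auth scheme, and model id are placeholders rather than anything defined by this commit:

```python
import os

from docling.datamodel.pipeline_options import PictureDescriptionLlamaStackApiOptions

options = PictureDescriptionLlamaStackApiOptions(
    url="https://llama-stack.example.com/v1/inference/chat-completion",  # placeholder endpoint
    headers={"Authorization": f"Bearer {os.environ.get('LLAMA_STACK_API_KEY', '')}"},  # assumed auth scheme
    params=dict(model_id="my-vision-model"),  # placeholder registered model id
    timeout=60,
)
```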
@@ -112,6 +112,7 @@ _Note: This option is only related to the system sending user data to remote services._
 The options in this list require the explicit `enable_remote_services=True` when processing the documents.
 
 - `PictureDescriptionApiOptions`: Using vision models via API calls.
+- `PictureDescriptionLlamaStackApiOptions`: Using vision models via llama-stack server API calls.
 
 #### Adjust pipeline features
 