refactor: Move OpenAI API call logic into utils.utils

This will allow reuse of this logic in a generic VLM model

NOTE: There is a subtle change here in the ordering of the text prompt and
the image in the call to the OpenAI API. When run against Ollama, this
ordering makes a big difference: if the prompt comes before the image, the
result is terse and unusable, whereas the prompt coming after the image
works as expected and matches the behavior of the non-OpenAI chat API.
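For illustration, a minimal sketch of the ordering that works (the base64
payload is elided as "..."):

    "content": [
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
        {"type": "text", "text": prompt},
    ]

With the two entries swapped (text part first), Ollama returns the terse,
unusable output described above.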

Branch: OllamaVlmModel

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Author: Gabe Goodhart
Date:   2025-04-09 08:02:01 -06:00
Commit: ad1541e8cf (parent: 8ef0b897c8)

3 changed files with 100 additions and 75 deletions

docling/datamodel/base_models.py

@@ -262,3 +262,34 @@ class Page(BaseModel):
     @property
     def image(self) -> Optional[Image]:
         return self.get_image(scale=self._default_image_scale)
+
+
+## OpenAI API Request / Response Models ##
+
+class OpenAiChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class OpenAiResponseChoice(BaseModel):
+    index: int
+    message: OpenAiChatMessage
+    finish_reason: str
+
+
+class OpenAiResponseUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+class OpenAiApiResponse(BaseModel):
+    model_config = ConfigDict(
+        protected_namespaces=(),
+    )
+
+    id: str
+    model: Optional[str] = None  # returned by openai
+    choices: List[OpenAiResponseChoice]
+    created: int
+    usage: OpenAiResponseUsage
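As a usage sketch for the new response models (the JSON body below is a
hand-written example of the standard chat-completions shape, not output
captured from this commit):

    from docling.datamodel.base_models import OpenAiApiResponse

    raw = """{
      "id": "chatcmpl-123",
      "created": 1744207321,
      "choices": [{
        "index": 0,
        "message": {"role": "assistant", "content": "A bar chart."},
        "finish_reason": "stop"
      }],
      "usage": {"prompt_tokens": 12, "completion_tokens": 4, "total_tokens": 16}
    }"""
    resp = OpenAiApiResponse.model_validate_json(raw)
    print(resp.choices[0].message.content)  # -> "A bar chart."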

docling/models/picture_description_api_model.py

@@ -1,12 +1,7 @@
-import base64
-import io
-import logging
 from pathlib import Path
-from typing import Iterable, List, Optional, Type, Union
+from typing import Iterable, Optional, Type, Union
 
-import requests
 from PIL import Image
-from pydantic import BaseModel, ConfigDict
 
 from docling.datamodel.pipeline_options import (
     AcceleratorOptions,
@@ -15,37 +10,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.exceptions import OperationNotAllowed
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.utils.utils import openai_image_request
-
-_log = logging.getLogger(__name__)
-
-
-class ChatMessage(BaseModel):
-    role: str
-    content: str
-
-
-class ResponseChoice(BaseModel):
-    index: int
-    message: ChatMessage
-    finish_reason: str
-
-
-class ResponseUsage(BaseModel):
-    prompt_tokens: int
-    completion_tokens: int
-    total_tokens: int
-
-
-class ApiResponse(BaseModel):
-    model_config = ConfigDict(
-        protected_namespaces=(),
-    )
-
-    id: str
-    model: Optional[str] = None  # returned by openai
-    choices: List[ResponseChoice]
-    created: int
-    usage: ResponseUsage
 
 
 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -83,43 +48,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
         for image in images:
-            img_io = io.BytesIO()
-            image.save(img_io, "PNG")
-            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": self.options.prompt,
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{image_base64}"
-                            },
-                        },
-                    ],
-                }
-            ]
-
-            payload = {
-                "messages": messages,
-                **self.options.params,
-            }
-
-            r = requests.post(
-                str(self.options.url),
-                headers=self.options.headers,
-                json=payload,
-                timeout=self.options.timeout,
-            )
-            if not r.ok:
-                _log.error(f"Error calling the API. Reponse was {r.text}")
-            r.raise_for_status()
-
-            api_resp = ApiResponse.model_validate_json(r.text)
-            generated_text = api_resp.choices[0].message.content.strip()
-            yield generated_text
+            yield openai_image_request(
+                image=image,
+                prompt=self.options.prompt,
+                url=self.options.url,
+                timeout=self.options.timeout,
+                headers=self.options.headers,
+                **self.options.params,
+            )
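With the request logic moved out, the call site is pure configuration. A
usage sketch (PictureDescriptionApiOptions and its field names are an
assumption based on the self.options.* references in this diff; they are
not defined in it):

    from docling.datamodel.pipeline_options import PictureDescriptionApiOptions

    options = PictureDescriptionApiOptions(
        url="http://localhost:11434/v1/chat/completions",  # OpenAI-compatible Ollama endpoint
        prompt="Describe this picture in one sentence.",
        params={"model": "granite3.2-vision:2b"},  # hypothetical model name
        timeout=90,
    )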

docling/utils/utils.py

@@ -1,12 +1,19 @@
+import base64
 import hashlib
+import logging
 from io import BytesIO
 from itertools import islice
 from pathlib import Path
 from typing import List, Union
 
 import requests
+from PIL import Image
 from tqdm import tqdm
 
+from docling.datamodel.base_models import OpenAiApiResponse
+
+_log = logging.getLogger(__name__)
+
 
 def chunkify(iterator, chunk_size):
     """Yield successive chunks of chunk_size from the iterable."""
@@ -63,3 +70,57 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
 
     buf.seek(0)
     return buf
+
+
+def openai_image_request(
+    image: Image.Image,
+    prompt: str,
+    url: str = "http://localhost:11434/v1/chat/completions",  # Default to ollama
+    apikey: str | None = None,
+    timeout: float = 20,
+    headers: dict[str, str] | None = None,
+    **params,
+) -> str:
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{image_base64}"
+                    },
+                },
+                {
+                    "type": "text",
+                    "text": prompt,
+                },
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        **params,
+    }
+
+    headers = headers or {}
+    if apikey is not None:
+        headers["Authorization"] = f"Bearer {apikey}"
+
+    r = requests.post(
+        str(url),
+        headers=headers,
+        json=payload,
+        timeout=timeout,
+    )
+    if not r.ok:
+        _log.error(f"Error calling the API. Response was {r.text}")
+    r.raise_for_status()
+
+    api_resp = OpenAiApiResponse.model_validate_json(r.text)
+    generated_text = api_resp.choices[0].message.content.strip()
+    return generated_text
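A quick usage sketch against a local Ollama server (the model name is an
assumption, not part of this commit; it is forwarded into the request body
via **params):

    from PIL import Image

    from docling.utils.utils import openai_image_request

    img = Image.open("figure.png")  # any PIL image
    description = openai_image_request(
        image=img,
        prompt="Describe this figure in three sentences.",
        model="granite3.2-vision:2b",  # hypothetical; ends up in the JSON payload
        timeout=90,
    )
    print(description)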