update vlm API

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-02-06 13:51:41 +01:00
parent 06342a5a28
commit 8ac000e35e
3 changed files with 97 additions and 46 deletions

View File

@ -197,12 +197,12 @@ class PicDescBaseOptions(BaseModel):
class PicDescApiOptions(PicDescBaseOptions): class PicDescApiOptions(PicDescBaseOptions):
kind: Literal["api"] = "api" kind: Literal["api"] = "api"
url: AnyUrl = AnyUrl("http://localhost/") url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
headers: Dict[str, str] = {} headers: Dict[str, str] = {}
params: Dict[str, Any] = {} params: Dict[str, Any] = {}
timeout: float = 20 timeout: float = 20
llm_prompt: str = "" prompt: str = "Describe this image in a few sentences."
provenance: str = "" provenance: str = ""

View File

@ -1,13 +1,14 @@
import base64 import base64
import io import io
import logging import logging
from typing import List, Optional from typing import Iterable, List, Optional
import httpx import httpx
from docling_core.types.doc import PictureItem from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
PictureDescriptionData, PictureDescriptionData,
) )
from PIL import Image
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict
from docling.datamodel.pipeline_options import PicDescApiOptions from docling.datamodel.pipeline_options import PicDescApiOptions
@ -39,62 +40,60 @@ class ApiResponse(BaseModel):
) )
id: str id: str
model: Optional[str] = None # returned bu openai model: Optional[str] = None # returned by openai
choices: List[ResponseChoice] choices: List[ResponseChoice]
created: int created: int
usage: ResponseUsage usage: ResponseUsage
class PictureDescriptionApiModel(PictureDescriptionBaseModel): class PictureDescriptionApiModel(PictureDescriptionBaseModel):
# elements_batch_size = 4
def __init__(self, enabled: bool, options: PicDescApiOptions): def __init__(self, enabled: bool, options: PicDescApiOptions):
super().__init__(enabled=enabled, options=options) super().__init__(enabled=enabled, options=options)
self.options: PicDescApiOptions self.options: PicDescApiOptions
def _annotate_image(self, picture: PictureItem) -> PictureDescriptionData: def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
assert picture.image is not None # Note: technically we could make a batch request here,
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
for image in images:
img_io = io.BytesIO()
image.save(img_io, "PNG")
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
img_io = io.BytesIO() messages = [
assert picture.image.pil_image is not None {
picture.image.pil_image.save(img_io, "PNG") "role": "user",
"content": [
{
"type": "text",
"text": self.options.prompt,
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{image_base64}"
},
},
],
}
]
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8") payload = {
"messages": messages,
messages = [ **self.options.params,
{
"role": "user",
"content": [
{
"type": "text",
"text": self.options.llm_prompt,
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
},
],
} }
]
payload = { r = httpx.post(
"messages": messages, str(self.options.url),
**self.options.params, headers=self.options.headers,
} json=payload,
timeout=self.options.timeout,
)
if not r.is_success:
_log.error(f"Error calling the API. Response was {r.text}")
r.raise_for_status()
r = httpx.post( api_resp = ApiResponse.model_validate_json(r.text)
str(self.options.url), generated_text = api_resp.choices[0].message.content.strip()
headers=self.options.headers, yield generated_text
json=payload,
timeout=self.options.timeout,
)
if not r.is_success:
_log.error(f"Error calling the API. Response was {r.text}")
r.raise_for_status()
api_resp = ApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()
return PictureDescriptionData(
provenance=self.options.provenance,
text=generated_text,
)

View File

@ -0,0 +1,52 @@
import logging
from pathlib import Path
from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
    """Run picture description over a sample PDF using a remote VLM API.

    Requires an OpenAI-compatible chat-completions endpoint; for example,
    launch one locally with:
        $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")

    # This is using a local API server to do picture description.
    # For example, you can launch it locally with:
    # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.picture_description_options = PicDescApiOptions(
        url="http://localhost:8000/v1/chat/completions",
        # Extra params forwarded verbatim in the chat-completions payload.
        params=dict(
            model="HuggingFaceTB/SmolVLM-256M-Instruct",
            seed=42,
            max_completion_tokens=200,
        ),
        # Fixed typo in the prompt sent to the VLM: "consise" -> "concise".
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    result = doc_converter.convert(input_doc_path)

    # Print the generated annotation for every picture in the converted document.
    for element, _level in result.document.iterate_items():
        if isinstance(element, PictureItem):
            print(
                f"Picture {element.self_ref}\n"
                f"Caption: {element.caption_text(doc=result.document)}\n"
                f"Annotations: {element.annotations}"
            )
# Allow running this example as a standalone script.
if __name__ == "__main__":
    main()