update vlm API

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-02-06 13:51:41 +01:00
parent 06342a5a28
commit 8ac000e35e
3 changed files with 97 additions and 46 deletions

View File

@ -197,12 +197,12 @@ class PicDescBaseOptions(BaseModel):
class PicDescApiOptions(PicDescBaseOptions):
kind: Literal["api"] = "api"
url: AnyUrl = AnyUrl("http://localhost/")
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
timeout: float = 20
llm_prompt: str = ""
prompt: str = "Describe this image in a few sentences."
provenance: str = ""

View File

@ -1,13 +1,14 @@
import base64
import io
import logging
from typing import List, Optional
from typing import Iterable, List, Optional
import httpx
from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
PictureDescriptionData,
)
from PIL import Image
from pydantic import BaseModel, ConfigDict
from docling.datamodel.pipeline_options import PicDescApiOptions
@ -39,25 +40,25 @@ class ApiResponse(BaseModel):
)
id: str
model: Optional[str] = None # returned bu openai
model: Optional[str] = None # returned by openai
choices: List[ResponseChoice]
created: int
usage: ResponseUsage
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
# elements_batch_size = 4
def __init__(self, enabled: bool, options: PicDescApiOptions):
super().__init__(enabled=enabled, options=options)
self.options: PicDescApiOptions
def _annotate_image(self, picture: PictureItem) -> PictureDescriptionData:
assert picture.image is not None
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
# Note: technically we could make a batch request here,
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
for image in images:
img_io = io.BytesIO()
assert picture.image.pil_image is not None
picture.image.pil_image.save(img_io, "PNG")
image.save(img_io, "PNG")
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
messages = [
@ -66,11 +67,13 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
"content": [
{
"type": "text",
"text": self.options.llm_prompt,
"text": self.options.prompt,
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
"image_url": {
"url": f"data:image/png;base64,{image_base64}"
},
},
],
}
@ -93,8 +96,4 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
api_resp = ApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()
return PictureDescriptionData(
provenance=self.options.provenance,
text=generated_text,
)
yield generated_text

View File

@ -0,0 +1,52 @@
import logging
from pathlib import Path
from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
    """Run the picture-description example against a local chat-completions API.

    Enables picture description on the PDF pipeline, points it at the API
    endpoint configured below, converts the sample PDF and prints the
    reference, caption and annotations of every picture item in the result.
    """
    logging.basicConfig(level=logging.INFO)

    # Relative path: the script is expected to be run from the repository root.
    input_doc_path = Path("./tests/data/2206.01062.pdf")

    # This is using a local API server to do picture description.
    # For example, you can launch it locally with:
    # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.picture_description_options = PicDescApiOptions(
        url="http://localhost:8000/v1/chat/completions",
        # NOTE(review): presumably forwarded as extra fields of the request
        # payload -- confirm against the API model implementation.
        params={
            "model": "HuggingFaceTB/SmolVLM-256M-Instruct",
            "seed": 42,
            "max_completion_tokens": 200,
        },
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )
    result = doc_converter.convert(input_doc_path)

    # Only picture items are reported; the nesting level is not needed here.
    for element, _level in result.document.iterate_items():
        if isinstance(element, PictureItem):
            print(
                f"Picture {element.self_ref}\n"
                f"Caption: {element.caption_text(doc=result.document)}\n"
                f"Annotations: {element.annotations}"
            )


if __name__ == "__main__":
    main()