update vlm API

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2025-02-06 13:51:41 +01:00
parent 06342a5a28
commit 8ac000e35e
3 changed files with 97 additions and 46 deletions

View File

@@ -197,12 +197,12 @@ class PicDescBaseOptions(BaseModel):
 class PicDescApiOptions(PicDescBaseOptions):
     kind: Literal["api"] = "api"
-    url: AnyUrl = AnyUrl("http://localhost/")
+    url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
     timeout: float = 20
-    llm_prompt: str = ""
+    prompt: str = "Describe this image in a few sentences."
     provenance: str = ""
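
The new defaults target an OpenAI-compatible `/v1/chat/completions` endpoint. As a hedged sketch (not part of this commit), the same options could point at a remote endpoint by filling in `headers` and `params`; the URL, auth header, and model name below are placeholders, and it is assumed that `headers` and `params` are forwarded unchanged to the HTTP request, which happens outside the hunk shown above.

```python
from docling.datamodel.pipeline_options import PicDescApiOptions

# Hypothetical values: the endpoint URL, auth header, and model name are placeholders.
options = PicDescApiOptions(
    url="https://example.com/v1/chat/completions",
    headers={"Authorization": "Bearer <your-api-key>"},
    params={"model": "<vision-model-name>"},
    prompt="Describe this image in a few sentences.",
    timeout=60,
)
```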

View File

@@ -1,13 +1,14 @@
 import base64
 import io
 import logging
-from typing import List, Optional
+from typing import Iterable, List, Optional

 import httpx
 from docling_core.types.doc import PictureItem
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
     PictureDescriptionData,
 )
+from PIL import Image
 from pydantic import BaseModel, ConfigDict

 from docling.datamodel.pipeline_options import PicDescApiOptions
@@ -39,25 +40,25 @@ class ApiResponse(BaseModel):
     )
     id: str
-    model: Optional[str] = None  # returned bu openai
+    model: Optional[str] = None  # returned by openai
     choices: List[ResponseChoice]
     created: int
     usage: ResponseUsage


 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
+    # elements_batch_size = 4

     def __init__(self, enabled: bool, options: PicDescApiOptions):
         super().__init__(enabled=enabled, options=options)
         self.options: PicDescApiOptions

-    def _annotate_image(self, picture: PictureItem) -> PictureDescriptionData:
-        assert picture.image is not None
-
-        img_io = io.BytesIO()
-        assert picture.image.pil_image is not None
-        picture.image.pil_image.save(img_io, "PNG")
-        image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-
-        messages = [
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        # Note: technically we could make a batch request here,
+        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
+        for image in images:
+            img_io = io.BytesIO()
+            image.save(img_io, "PNG")
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

+            messages = [
@@ -66,11 +67,13 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
-                "content": [
-                    {
-                        "type": "text",
-                        "text": self.options.llm_prompt,
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                    },
-                ],
-            }
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": self.options.prompt,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            },
+                        },
+                    ],
+                }
@@ -93,8 +96,4 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
-        api_resp = ApiResponse.model_validate_json(r.text)
-        generated_text = api_resp.choices[0].message.content.strip()
-
-        return PictureDescriptionData(
-            provenance=self.options.provenance,
-            text=generated_text,
-        )
+            api_resp = ApiResponse.model_validate_json(r.text)
+            generated_text = api_resp.choices[0].message.content.strip()
+            yield generated_text
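
For reference, the per-image round trip that the new `_annotate_images` generator performs can be sketched as a standalone function. This is a hedged reconstruction, not code from the commit: the `describe_image` helper is hypothetical, and the way the extra `options.params` (such as the model name) are merged into the JSON body is assumed, since that part of the file falls outside the hunks shown.

```python
import base64
import io

import httpx
from PIL import Image


def describe_image(image: Image.Image, url: str, prompt: str, model: str) -> str:
    # Encode the PIL image as a base64 PNG data URI, as in the hunk above.
    img_io = io.BytesIO()
    image.save(img_io, "PNG")
    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

    # OpenAI-style chat message carrying the prompt plus the inline image.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
            ],
        }
    ]

    # Assumption: extra request parameters (such as the model name) sit next to
    # "messages" in the JSON body; that merging step is not shown in the diff.
    payload = {"messages": messages, "model": model}

    r = httpx.post(url, json=payload, timeout=20)
    r.raise_for_status()
    # The response carries the generated text in choices[0].message.content,
    # which the ApiResponse model above validates.
    return r.json()["choices"][0]["message"]["content"].strip()
```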

View File

@@ -0,0 +1,52 @@
+import logging
+from pathlib import Path
+
+from docling_core.types.doc import PictureItem
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
+
+    # This is using a local API server to do picture description.
+    # For example, you can launch it locally with:
+    # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_picture_description = True
+    pipeline_options.picture_description_options = PicDescApiOptions(
+        url="http://localhost:8000/v1/chat/completions",
+        params=dict(
+            model="HuggingFaceTB/SmolVLM-256M-Instruct",
+            seed=42,
+            max_completion_tokens=200,
+        ),
+        prompt="Describe the image in three sentences. Be concise and accurate.",
+        timeout=90,
+    )
+
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+
+    result = doc_converter.convert(input_doc_path)
+
+    for element, _level in result.document.iterate_items():
+        if isinstance(element, PictureItem):
+            print(
+                f"Picture {element.self_ref}\n"
+                f"Caption: {element.caption_text(doc=result.document)}\n"
+                f"Annotations: {element.annotations}"
+            )
+
+
+if __name__ == "__main__":
+    main()
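
A possible follow-up to the example above (a sketch, assuming the API model stores its output on each picture as `PictureDescriptionData` entries, as suggested by the removed return value in the model diff): pull out only the generated text and its provenance instead of printing the raw annotations list.

```python
from docling_core.types.doc.document import PictureDescriptionData

# Continues from the example above: `result` and `PictureItem` are already in scope.
for element, _level in result.document.iterate_items():
    if isinstance(element, PictureItem):
        for annotation in element.annotations:
            if isinstance(annotation, PictureDescriptionData):
                print(f"{element.self_ref} ({annotation.provenance}): {annotation.text}")
```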