From 8ac000e35ea75768af3fd633fcae66da8524354a Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Thu, 6 Feb 2025 13:51:41 +0100 Subject: [PATCH] update vlm API Signed-off-by: Michele Dolfi --- docling/datamodel/pipeline_options.py | 4 +- docling/models/pic_description_api_model.py | 87 ++++++++++----------- docs/examples/pictures_description_api.py | 52 ++++++++++++ 3 files changed, 97 insertions(+), 46 deletions(-) create mode 100644 docs/examples/pictures_description_api.py diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 02852a6d..7e7d5378 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -197,12 +197,12 @@ class PicDescBaseOptions(BaseModel): class PicDescApiOptions(PicDescBaseOptions): kind: Literal["api"] = "api" - url: AnyUrl = AnyUrl("http://localhost/") + url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions") headers: Dict[str, str] = {} params: Dict[str, Any] = {} timeout: float = 20 - llm_prompt: str = "" + prompt: str = "Describe this image in a few sentences." 
provenance: str = "" diff --git a/docling/models/pic_description_api_model.py b/docling/models/pic_description_api_model.py index 6300a209..a68b2b4e 100644 --- a/docling/models/pic_description_api_model.py +++ b/docling/models/pic_description_api_model.py @@ -1,13 +1,14 @@ import base64 import io import logging -from typing import List, Optional +from typing import Iterable, List, Optional import httpx from docling_core.types.doc import PictureItem from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc PictureDescriptionData, ) +from PIL import Image from pydantic import BaseModel, ConfigDict from docling.datamodel.pipeline_options import PicDescApiOptions @@ -39,62 +40,60 @@ class ApiResponse(BaseModel): ) id: str - model: Optional[str] = None # returned bu openai + model: Optional[str] = None # returned by openai choices: List[ResponseChoice] created: int usage: ResponseUsage class PictureDescriptionApiModel(PictureDescriptionBaseModel): + # elements_batch_size = 4 def __init__(self, enabled: bool, options: PicDescApiOptions): super().__init__(enabled=enabled, options=options) self.options: PicDescApiOptions - def _annotate_image(self, picture: PictureItem) -> PictureDescriptionData: - assert picture.image is not None + def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]: + # Note: technically we could make a batch request here, + # but not all APIs will allow for it. For example, vllm won't allow more than 1. 
+ for image in images: + img_io = io.BytesIO() + image.save(img_io, "PNG") + image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8") - img_io = io.BytesIO() - assert picture.image.pil_image is not None - picture.image.pil_image.save(img_io, "PNG") + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": self.options.prompt, + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_base64}" + }, + }, + ], + } + ] - image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8") - - messages = [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": self.options.llm_prompt, - }, - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{image_base64}"}, - }, - ], + payload = { + "messages": messages, + **self.options.params, } - ] - payload = { - "messages": messages, - **self.options.params, - } + r = httpx.post( + str(self.options.url), + headers=self.options.headers, + json=payload, + timeout=self.options.timeout, + ) + if not r.is_success: + _log.error(f"Error calling the API. Response was {r.text}") + r.raise_for_status() - r = httpx.post( - str(self.options.url), - headers=self.options.headers, - json=payload, - timeout=self.options.timeout, - ) - if not r.is_success: - _log.error(f"Error calling the API. 
Reponse was {r.text}") - r.raise_for_status() - - api_resp = ApiResponse.model_validate_json(r.text) - generated_text = api_resp.choices[0].message.content.strip() - - return PictureDescriptionData( - provenance=self.options.provenance, - text=generated_text, - ) + api_resp = ApiResponse.model_validate_json(r.text) + generated_text = api_resp.choices[0].message.content.strip() + yield generated_text diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py new file mode 100644 index 00000000..7c42162c --- /dev/null +++ b/docs/examples/pictures_description_api.py @@ -0,0 +1,52 @@ +import logging +from pathlib import Path + +from docling_core.types.doc import PictureItem + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions +from docling.document_converter import DocumentConverter, PdfFormatOption + + +def main(): + logging.basicConfig(level=logging.INFO) + + input_doc_path = Path("./tests/data/2206.01062.pdf") + + # This is using a local API server to do picture description. + # For example, you can launch it locally with: + # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct" + + pipeline_options = PdfPipelineOptions() + pipeline_options.do_picture_description = True + pipeline_options.picture_description_options = PicDescApiOptions( + url="http://localhost:8000/v1/chat/completions", + params=dict( + model="HuggingFaceTB/SmolVLM-256M-Instruct", + seed=42, + max_completion_tokens=200, + ), + prompt="Describe the image in three sentences. 
Be concise and accurate.", + timeout=90, + ) + + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ) + } + ) + result = doc_converter.convert(input_doc_path) + + for element, _level in result.document.iterate_items(): + if isinstance(element, PictureItem): + print( + f"Picture {element.self_ref}\n" + f"Caption: {element.caption_text(doc=result.document)}\n" + f"Annotations: {element.annotations}" + ) + + +if __name__ == "__main__": + main()