Mirror of https://github.com/DS4SD/docling.git (synced 2025-08-02 07:22:14 +00:00)

commit 8ac000e35e (parent 06342a5a28)

update vlm API

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
@@ -197,12 +197,12 @@ class PicDescBaseOptions(BaseModel):
 class PicDescApiOptions(PicDescBaseOptions):
     kind: Literal["api"] = "api"

-    url: AnyUrl = AnyUrl("http://localhost/")
+    url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
     timeout: float = 20

-    llm_prompt: str = ""
+    prompt: str = "Describe this image in a few sentences."
     provenance: str = ""
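For callers, the visible change in this hunk is that llm_prompt is renamed to prompt (now with a non-empty default) and the default url points at an OpenAI-compatible chat-completions endpoint. A minimal illustrative caller, not part of the diff, might look like this:

    from docling.datamodel.pipeline_options import PicDescApiOptions

    # Before this commit a caller would have set `llm_prompt`; after it, the field
    # is `prompt` and the default URL already targets /v1/chat/completions.
    options = PicDescApiOptions(prompt="Describe this image in a few sentences.")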
@@ -1,13 +1,14 @@
 import base64
 import io
 import logging
-from typing import List, Optional
+from typing import Iterable, List, Optional

 import httpx
 from docling_core.types.doc import PictureItem
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
     PictureDescriptionData,
 )
+from PIL import Image
 from pydantic import BaseModel, ConfigDict

 from docling.datamodel.pipeline_options import PicDescApiOptions
@@ -39,25 +40,25 @@ class ApiResponse(BaseModel):
     )

     id: str
-    model: Optional[str] = None  # returned bu openai
+    model: Optional[str] = None  # returned by openai
     choices: List[ResponseChoice]
     created: int
     usage: ResponseUsage


 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     # elements_batch_size = 4

     def __init__(self, enabled: bool, options: PicDescApiOptions):
         super().__init__(enabled=enabled, options=options)
         self.options: PicDescApiOptions

-    def _annotate_image(self, picture: PictureItem) -> PictureDescriptionData:
-        assert picture.image is not None
-        img_io = io.BytesIO()
-        assert picture.image.pil_image is not None
-        picture.image.pil_image.save(img_io, "PNG")
-
-        image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-
-        messages = [
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        # Note: technically we could make a batch request here,
+        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
+        for image in images:
+            img_io = io.BytesIO()
+            image.save(img_io, "PNG")
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+            messages = [
@@ -66,11 +67,13 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
                     "content": [
                         {
                             "type": "text",
-                            "text": self.options.llm_prompt,
+                            "text": self.options.prompt,
                         },
                         {
                             "type": "image_url",
-                            "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            },
                         },
                     ],
                 }
@@ -93,8 +96,4 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):

             api_resp = ApiResponse.model_validate_json(r.text)
             generated_text = api_resp.choices[0].message.content.strip()
-
-        return PictureDescriptionData(
-            provenance=self.options.provenance,
-            text=generated_text,
-        )
+            yield generated_text
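Taken together, these hunks change the model from annotating a single PictureItem to yielding one description per PIL image, with one HTTP request per image. A standalone sketch of the request shape, not part of the diff, follows; the "role" key and the way params are merged into the payload are assumptions, since those lines fall outside the hunks shown here:

    import base64
    import io

    import httpx
    from PIL import Image


    def describe_image(
        image: Image.Image,
        url: str,
        prompt: str,
        params: dict,
        headers: dict,
        timeout: float,
    ) -> str:
        # Encode the picture as a base64 PNG data URL, as in the hunk above.
        img_io = io.BytesIO()
        image.save(img_io, "PNG")
        image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

        payload = {
            "messages": [
                {
                    "role": "user",  # assumed: this line is outside the shown hunks
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                        },
                    ],
                }
            ],
            **params,  # e.g. model, seed, max_completion_tokens from PicDescApiOptions.params
        }
        r = httpx.post(url, headers=headers, json=payload, timeout=timeout)
        r.raise_for_status()
        # One description per image, mirroring the new `yield generated_text`.
        return r.json()["choices"][0]["message"]["content"].strip()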
docs/examples/pictures_description_api.py (new file, 52 lines)
@@ -0,0 +1,52 @@
+import logging
+from pathlib import Path
+
+from docling_core.types.doc import PictureItem
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
+
+    # This is using a local API server to do picture description.
+    # For example, you can launch it locally with:
+    # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"
+
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_picture_description = True
+    pipeline_options.picture_description_options = PicDescApiOptions(
+        url="http://localhost:8000/v1/chat/completions",
+        params=dict(
+            model="HuggingFaceTB/SmolVLM-256M-Instruct",
+            seed=42,
+            max_completion_tokens=200,
+        ),
+        prompt="Describe the image in three sentences. Be concise and accurate.",
+        timeout=90,
+    )
+
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+    result = doc_converter.convert(input_doc_path)
+
+    for element, _level in result.document.iterate_items():
+        if isinstance(element, PictureItem):
+            print(
+                f"Picture {element.self_ref}\n"
+                f"Caption: {element.caption_text(doc=result.document)}\n"
+                f"Annotations: {element.annotations}"
+            )
+
+
+if __name__ == "__main__":
+    main()
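To try the new example, start the local server referenced in its comments (vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct") and then run python docs/examples/pictures_description_api.py; each PictureItem in the converted test document should then be printed together with its caption and the description returned by the remote model.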