mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
update vlm API
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
06342a5a28
commit
8ac000e35e
@ -197,12 +197,12 @@ class PicDescBaseOptions(BaseModel):
|
|||||||
class PicDescApiOptions(PicDescBaseOptions):
|
class PicDescApiOptions(PicDescBaseOptions):
|
||||||
kind: Literal["api"] = "api"
|
kind: Literal["api"] = "api"
|
||||||
|
|
||||||
url: AnyUrl = AnyUrl("http://localhost/")
|
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
|
||||||
headers: Dict[str, str] = {}
|
headers: Dict[str, str] = {}
|
||||||
params: Dict[str, Any] = {}
|
params: Dict[str, Any] = {}
|
||||||
timeout: float = 20
|
timeout: float = 20
|
||||||
|
|
||||||
llm_prompt: str = ""
|
prompt: str = "Describe this image in a few sentences."
|
||||||
provenance: str = ""
|
provenance: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,13 +1,14 @@
|
|||||||
import base64
|
import base64
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
from typing import List, Optional
|
from typing import Iterable, List, Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from docling_core.types.doc import PictureItem
|
from docling_core.types.doc import PictureItem
|
||||||
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
|
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
|
||||||
PictureDescriptionData,
|
PictureDescriptionData,
|
||||||
)
|
)
|
||||||
|
from PIL import Image
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import PicDescApiOptions
|
from docling.datamodel.pipeline_options import PicDescApiOptions
|
||||||
@ -39,62 +40,60 @@ class ApiResponse(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
id: str
|
id: str
|
||||||
model: Optional[str] = None # returned bu openai
|
model: Optional[str] = None # returned by openai
|
||||||
choices: List[ResponseChoice]
|
choices: List[ResponseChoice]
|
||||||
created: int
|
created: int
|
||||||
usage: ResponseUsage
|
usage: ResponseUsage
|
||||||
|
|
||||||
|
|
||||||
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
||||||
|
# elements_batch_size = 4
|
||||||
|
|
||||||
def __init__(self, enabled: bool, options: PicDescApiOptions):
|
def __init__(self, enabled: bool, options: PicDescApiOptions):
|
||||||
super().__init__(enabled=enabled, options=options)
|
super().__init__(enabled=enabled, options=options)
|
||||||
self.options: PicDescApiOptions
|
self.options: PicDescApiOptions
|
||||||
|
|
||||||
def _annotate_image(self, picture: PictureItem) -> PictureDescriptionData:
|
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
|
||||||
assert picture.image is not None
|
# Note: technically we could make a batch request here,
|
||||||
|
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
|
||||||
|
for image in images:
|
||||||
|
img_io = io.BytesIO()
|
||||||
|
image.save(img_io, "PNG")
|
||||||
|
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
|
||||||
|
|
||||||
img_io = io.BytesIO()
|
messages = [
|
||||||
assert picture.image.pil_image is not None
|
{
|
||||||
picture.image.pil_image.save(img_io, "PNG")
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": self.options.prompt,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": f"data:image/png;base64,{image_base64}"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
|
payload = {
|
||||||
|
"messages": messages,
|
||||||
messages = [
|
**self.options.params,
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": self.options.llm_prompt,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "image_url",
|
|
||||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
}
|
}
|
||||||
]
|
|
||||||
|
|
||||||
payload = {
|
r = httpx.post(
|
||||||
"messages": messages,
|
str(self.options.url),
|
||||||
**self.options.params,
|
headers=self.options.headers,
|
||||||
}
|
json=payload,
|
||||||
|
timeout=self.options.timeout,
|
||||||
|
)
|
||||||
|
if not r.is_success:
|
||||||
|
_log.error(f"Error calling the API. Reponse was {r.text}")
|
||||||
|
r.raise_for_status()
|
||||||
|
|
||||||
r = httpx.post(
|
api_resp = ApiResponse.model_validate_json(r.text)
|
||||||
str(self.options.url),
|
generated_text = api_resp.choices[0].message.content.strip()
|
||||||
headers=self.options.headers,
|
yield generated_text
|
||||||
json=payload,
|
|
||||||
timeout=self.options.timeout,
|
|
||||||
)
|
|
||||||
if not r.is_success:
|
|
||||||
_log.error(f"Error calling the API. Reponse was {r.text}")
|
|
||||||
r.raise_for_status()
|
|
||||||
|
|
||||||
api_resp = ApiResponse.model_validate_json(r.text)
|
|
||||||
generated_text = api_resp.choices[0].message.content.strip()
|
|
||||||
|
|
||||||
return PictureDescriptionData(
|
|
||||||
provenance=self.options.provenance,
|
|
||||||
text=generated_text,
|
|
||||||
)
|
|
||||||
|
52
docs/examples/pictures_description_api.py
Normal file
52
docs/examples/pictures_description_api.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling_core.types.doc import PictureItem
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Convert a sample PDF and print an API-generated description per picture.

    Picture description is delegated to an HTTP chat-completions endpoint
    configured through ``PicDescApiOptions``. After conversion, every
    ``PictureItem`` in the resulting document is printed together with its
    caption and annotations.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")

    # This is using a local API server to do picture description.
    # For example, you can launch it locally with:
    # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.picture_description_options = PicDescApiOptions(
        url="http://localhost:8000/v1/chat/completions",
        # Extra fields for the request; the model name must match the one
        # the server was launched with.
        params=dict(
            model="HuggingFaceTB/SmolVLM-256M-Instruct",
            seed=42,
            max_completion_tokens=200,
        ),
        prompt="Describe the image in three sentences. Be concise and accurate.",
        # Raised well above the option's default of 20 for this example.
        timeout=90,
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )
    result = doc_converter.convert(input_doc_path)

    for element, _level in result.document.iterate_items():
        if isinstance(element, PictureItem):
            print(
                f"Picture {element.self_ref}\n"
                f"Caption: {element.caption_text(doc=result.document)}\n"
                f"Annotations: {element.annotations}"
            )


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user