Mirror of https://github.com/DS4SD/docling.git, synced 2025-12-16 16:48:21 +00:00
feat: Describe pictures using vision models (#259)
* draft for picture description models
* vlm description using AutoModelForVision2Seq
* add generation options
* update vlm API
* allow only localhost traffic
* rename model
* do not run with vlm api
* more renaming
* fix examples path
* apply CLI download logic
* fix name of cli argument
* use with_smolvlm in models download

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
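At a glance, the new enrichment is driven from the pipeline options. A minimal usage sketch, consistent with this PR but not verbatim from the diff (the do_picture_description flag, the picture_description_options field, and the SmolVLM repo id are assumptions):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True  # assumed flag for the new enrichment
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",  # assumed example repo id
    prompt="Describe the image in three sentences.",
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = converter.convert("report.pdf")

# Each described picture carries a PictureDescriptionData annotation.
for picture in result.document.pictures:
    for annotation in picture.annotations:
        print(picture.self_ref, "->", annotation.text)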
docling/models/base_model.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Generic, Iterable, Optional
 
-from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem
+from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar
 
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
@@ -64,7 +64,7 @@ class BaseItemAndImageEnrichmentModel(
         if not self.is_processable(doc=conv_res.document, element=element):
             return None
 
-        assert isinstance(element, TextItem)
+        assert isinstance(element, DocItem)
         element_prov = element.prov[0]
 
         bbox = element_prov.bbox
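The point of this change: PictureItem, like TextItem, is a subclass of DocItem, so picture elements can now pass through the item-and-image enrichment path. A quick illustration (not part of the diff):

from docling_core.types.doc import DocItem, PictureItem, TextItem

# Both element types satisfy the widened isinstance check above.
assert issubclass(TextItem, DocItem)
assert issubclass(PictureItem, DocItem)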
docling/models/picture_description_api_model.py (new file, 105 lines)

import base64
import io
import logging
from typing import Iterable, List, Optional

import httpx
from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
    PictureDescriptionData,
)
from PIL import Image
from pydantic import BaseModel, ConfigDict

from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
from docling.models.picture_description_base_model import PictureDescriptionBaseModel

_log = logging.getLogger(__name__)


class ChatMessage(BaseModel):
    role: str
    content: str


class ResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: str


class ResponseUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class ApiResponse(BaseModel):
    model_config = ConfigDict(
        protected_namespaces=(),
    )

    id: str
    model: Optional[str] = None  # returned by openai
    choices: List[ResponseChoice]
    created: int
    usage: ResponseUsage


class PictureDescriptionApiModel(PictureDescriptionBaseModel):
    # elements_batch_size = 4

    def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
        super().__init__(enabled=enabled, options=options)
        self.options: PictureDescriptionApiOptions

        if self.enabled:
            if options.url.host != "localhost":
                raise NotImplementedError(
                    "The options try to connect to remote APIs which are not yet allowed."
                )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Note: technically we could make a batch request here,
        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
        for image in images:
            img_io = io.BytesIO()
            image.save(img_io, "PNG")
            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.options.prompt,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            },
                        },
                    ],
                }
            ]

            payload = {
                "messages": messages,
                **self.options.params,
            }

            r = httpx.post(
                str(self.options.url),
                headers=self.options.headers,
                json=payload,
                timeout=self.options.timeout,
            )
            if not r.is_success:
                _log.error(f"Error calling the API. Response was {r.text}")
                r.raise_for_status()

            api_resp = ApiResponse.model_validate_json(r.text)
            generated_text = api_resp.choices[0].message.content.strip()
            yield generated_text
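For reference, a configuration that exercises this model against a local OpenAI-compatible server might look as follows. This is a sketch: the server command, model name, and parameter values are illustrative assumptions; the option fields themselves (url, params, prompt, timeout) are the ones read by the code above.

from docling.datamodel.pipeline_options import PictureDescriptionApiOptions

# Assumes a local OpenAI-compatible endpoint, e.g. started with:
#   vllm serve HuggingFaceTB/SmolVLM-256M-Instruct --port 8000
api_options = PictureDescriptionApiOptions(
    url="http://localhost:8000/v1/chat/completions",  # non-localhost hosts are rejected in __init__
    params=dict(model="HuggingFaceTB/SmolVLM-256M-Instruct", max_tokens=64),
    prompt="Describe this image in a few sentences.",
    timeout=60,
)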
docling/models/picture_description_base_model.py (new file, 64 lines)

import logging
from pathlib import Path
from typing import Any, Iterable, List, Optional, Union

from docling_core.types.doc import (
    DoclingDocument,
    NodeItem,
    PictureClassificationClass,
    PictureItem,
)
from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
    PictureDescriptionData,
)
from PIL import Image

from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.base_model import (
    BaseItemAndImageEnrichmentModel,
    ItemAndImageEnrichmentElement,
)


class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
    images_scale: float = 2.0

    def __init__(
        self,
        enabled: bool,
        options: PictureDescriptionBaseOptions,
    ):
        self.enabled = enabled
        self.options = options
        self.provenance = "not-implemented"

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        return self.enabled and isinstance(element, PictureItem)

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        raise NotImplementedError

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        if not self.enabled:
            for element in element_batch:
                yield element.item
            return

        images: List[Image.Image] = []
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el.item, PictureItem)
            elements.append(el.item)
            images.append(el.image)

        outputs = self._annotate_images(images)

        for item, output in zip(elements, outputs):
            item.annotations.append(
                PictureDescriptionData(text=output, provenance=self.provenance)
            )
            yield item
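Custom backends only need to override _annotate_images; batching, PictureItem filtering, and annotation bookkeeping stay in the base class. A minimal sketch (the DummyPictureDescriber class and its constant output are illustrative, not part of the PR):

from typing import Iterable

from PIL import Image

from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.picture_description_base_model import PictureDescriptionBaseModel


class DummyPictureDescriber(PictureDescriptionBaseModel):
    # Illustrative subclass: emits one fixed-format caption per input image.

    def __init__(self, enabled: bool, options: PictureDescriptionBaseOptions):
        super().__init__(enabled=enabled, options=options)
        self.provenance = "dummy-describer"  # recorded in each PictureDescriptionData

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        for image in images:
            yield f"An image of size {image.width}x{image.height} pixels."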
docling/models/picture_description_vlm_model.py (new file, 109 lines)

from pathlib import Path
from typing import Iterable, Optional, Union

from PIL import Image

from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionVlmOptions,
)
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.utils.accelerator_utils import decide_device


class PictureDescriptionVlmModel(PictureDescriptionBaseModel):

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionVlmOptions,
        accelerator_options: AcceleratorOptions,
    ):
        super().__init__(enabled=enabled, options=options)
        self.options: PictureDescriptionVlmOptions

        if self.enabled:
            if artifacts_path is None:
                artifacts_path = self.download_models(repo_id=self.options.repo_id)
            else:
                artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder

            self.device = decide_device(accelerator_options.device)

            try:
                import torch
                from transformers import AutoModelForVision2Seq, AutoProcessor
            except ImportError:
                raise ImportError(
                    "transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`."
                )

            # Initialize processor and model
            self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
            self.model = AutoModelForVision2Seq.from_pretrained(
                self.options.repo_id,
                torch_dtype=torch.bfloat16,
                _attn_implementation=(
                    "flash_attention_2" if self.device.startswith("cuda") else "eager"
                ),
            ).to(self.device)

            self.provenance = f"{self.options.repo_id}"

    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars

        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id=repo_id,
            force_download=force,
            local_dir=local_dir,
        )

        return Path(download_path)

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        from transformers import GenerationConfig

        # Create input messages
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": self.options.prompt},
                ],
            },
        ]

        # TODO: do batch generation

        for image in images:
            # Prepare inputs
            prompt = self.processor.apply_chat_template(
                messages, add_generation_prompt=True
            )
            inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
            inputs = inputs.to(self.device)

            # Generate outputs
            generated_ids = self.model.generate(
                **inputs,
                generation_config=GenerationConfig(**self.options.generation_config),
            )
            generated_texts = self.processor.batch_decode(
                generated_ids[:, inputs["input_ids"].shape[1] :],
                skip_special_tokens=True,
            )

            yield generated_texts[0].strip()
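Because weights are fetched lazily on first use, deployments may want to prefetch them with the static helper above. A small sketch (the repo id and target directory are illustrative):

from pathlib import Path

from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel

# Prefetch the VLM weights so the first conversion does not download them.
weights_path: Path = PictureDescriptionVlmModel.download_models(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",  # illustrative repo id
    local_dir=Path("./models/SmolVLM-256M-Instruct"),
    progress=True,
)
print(f"Weights available at {weights_path}")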