rename and refactor

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2025-04-10 11:33:00 +02:00
parent 0f438b3a76
commit f77c8cf96c
6 changed files with 81 additions and 77 deletions

docling/datamodel/pipeline_options.py

@@ -285,10 +285,12 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")
 
 
-class OpenAiVlmOptions(BaseVlmOptions):
-    kind: Literal["openai_model_options"] = "openai_model_options"
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
 
-    url: AnyUrl = AnyUrl("http://localhost:11434/v1/chat/completions")  # Default to ollama
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
     scale: float = 2.0
@@ -319,7 +321,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
-granite_vision_vlm_ollama_conversion_options = OpenAiVlmOptions(
+granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
     prompt="OCR the full page to markdown.",
@@ -384,7 +386,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions, OpenAiVlmOptions] = (
+    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
        smoldocling_vlm_conversion_options
     )
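
With the rename, `ApiVlmOptions` makes explicit that any OpenAI-compatible chat-completions endpoint can be targeted, not only OpenAI itself. A minimal sketch of a custom configuration; the endpoint, auth header, and model name are illustrative placeholders, and `response_format` is assumed from the surrounding options classes (the pipeline imports `ResponseFormat` below) rather than shown in this hunk:

from pydantic import AnyUrl

from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat

# Hypothetical options for an authenticated OpenAI-compatible endpoint.
vlm_options = ApiVlmOptions(
    url=AnyUrl("https://my-vlm.example.com/v1/chat/completions"),  # placeholder endpoint
    headers={"Authorization": "Bearer <token>"},  # extra HTTP headers, e.g. auth
    params={"model": "my-vision-model"},  # merged into the JSON request payload
    prompt="OCR the full page to markdown.",
    scale=2.0,  # page-image scale before upload
    response_format=ResponseFormat.MARKDOWN,  # assumed field, see note above
)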

docling/models/openai_vlm_model.py → docling/models/api_vlm_model.py (renamed)

@@ -2,18 +2,18 @@ from typing import Iterable
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import OpenAiVlmOptions
+from docling.datamodel.pipeline_options import ApiVlmOptions
 from docling.models.base_model import BasePageModel
+from docling.utils.api_image_request import api_image_request
 from docling.utils.profiling import TimeRecorder
-from docling.utils.utils import openai_image_request
 
 
-class OpenAiVlmModel(BasePageModel):
+class ApiVlmModel(BasePageModel):
     def __init__(
         self,
         enabled: bool,
-        vlm_options: OpenAiVlmOptions,
+        vlm_options: ApiVlmOptions,
     ):
         self.enabled = enabled
         self.vlm_options = vlm_options
@@ -44,7 +44,7 @@ class OpenAiVlmModel(BasePageModel):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
 
-                page_tags = openai_image_request(
+                page_tags = api_image_request(
                     image=hi_res_image,
                     prompt=self.prompt_content,
                     url=self.vlm_options.url,

docling/models/picture_description_api_model.py

@@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.exceptions import OperationNotAllowed
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
-from docling.utils.utils import openai_image_request
+from docling.utils.api_image_request import api_image_request
 
 
 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -48,7 +48,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
         for image in images:
-            yield openai_image_request(
+            yield api_image_request(
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
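
On the enrichment side, the same helper now serves `PictureDescriptionApiModel`. A hedged sketch of how this model is typically enabled, assuming the `PictureDescriptionApiOptions` and `enable_remote_services` knobs from the surrounding codebase (neither is introduced by this commit; endpoint and model name are placeholders):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(
    enable_remote_services=True,  # required, since images leave the local machine
    do_picture_description=True,
    picture_description_options=PictureDescriptionApiOptions(
        url="http://localhost:11434/v1/chat/completions",  # placeholder endpoint
        params={"model": "granite3.2-vision:2b"},  # placeholder model name
        prompt="Describe this picture in two sentences.",
    ),
)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)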

docling/pipeline/vlm_pipeline.py

@@ -15,16 +15,16 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
     HuggingFaceVlmOptions,
     InferenceFramework,
-    OpenAiVlmOptions,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.api_vlm_model import ApiVlmModel
 from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.models.openai_vlm_model import OpenAiVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -60,13 +60,11 @@ class VlmPipeline(PaginatedPipeline):
         self.keep_images = self.pipeline_options.generate_page_images
 
-        if isinstance(pipeline_options.vlm_options, OpenAiVlmOptions):
+        if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
             self.build_pipe = [
-                OpenAiVlmModel(
+                ApiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-                    vlm_options=cast(
-                        OpenAiVlmOptions, self.pipeline_options.vlm_options
-                    ),
+                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                 ),
             ]
         elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
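
End to end, an `ApiVlmOptions` instance routed through `VlmPipelineOptions` now selects `ApiVlmModel` instead of a local HuggingFace model. A sketch of a full conversion using the Ollama preset from this commit; `enable_remote_services` and the input path are assumptions, not part of this diff:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(
    enable_remote_services=True,  # assumed knob: page images are sent to a remote API
    vlm_options=granite_vision_vlm_ollama_conversion_options,  # ApiVlmOptions preset above
)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("document.pdf")  # placeholder input path
print(result.document.export_to_markdown())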

docling/utils/api_image_request.py (new file)

@@ -0,0 +1,61 @@
+import base64
+import logging
+from io import BytesIO
+from typing import Dict, Optional
+
+import requests
+from PIL import Image
+from pydantic import AnyUrl
+
+from docling.datamodel.base_models import OpenAiApiResponse
+
+_log = logging.getLogger(__name__)
+
+
+def api_image_request(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    **params,
+) -> str:
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                },
+                {
+                    "type": "text",
+                    "text": prompt,
+                },
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        **params,
+    }
+
+    headers = headers or {}
+
+    r = requests.post(
+        str(url),
+        headers=headers,
+        json=payload,
+        timeout=timeout,
+    )
+    if not r.ok:
+        _log.error(f"Error calling the API. Response was {r.text}")
+    r.raise_for_status()
+
+    api_resp = OpenAiApiResponse.model_validate_json(r.text)
+    generated_text = api_resp.choices[0].message.content.strip()
+    return generated_text
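
Since the full signature is visible above, the helper can also be exercised in isolation. A quick sketch against a local Ollama server; the image path and model name are placeholders, and extra keyword arguments travel through **params into the JSON payload:

from PIL import Image
from pydantic import AnyUrl

from docling.utils.api_image_request import api_image_request

image = Image.open("page.png")  # placeholder input image
text = api_image_request(
    image=image,
    prompt="OCR the full page to markdown.",
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    timeout=60,
    model="granite3.2-vision:2b",  # forwarded via **params into the payload
)
print(text)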

docling/utils/utils.py

@@ -1,20 +1,12 @@
-import base64
 import hashlib
-import logging
 from io import BytesIO
 from itertools import islice
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import List, Union
 
 import requests
-from PIL import Image
-from pydantic import AnyUrl
 from tqdm import tqdm
 
-from docling.datamodel.base_models import OpenAiApiResponse
-
-_log = logging.getLogger(__name__)
-
 
 def chunkify(iterator, chunk_size):
     """Yield successive chunks of chunk_size from the iterable."""
@@ -71,52 +63,3 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
     buf.seek(0)
     return buf
-
-
-def openai_image_request(
-    image: Image.Image,
-    prompt: str,
-    url: AnyUrl,
-    timeout: float = 20,
-    headers: Optional[Dict[str, str]] = None,
-    **params,
-) -> str:
-    img_io = BytesIO()
-    image.save(img_io, "PNG")
-    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                },
-                {
-                    "type": "text",
-                    "text": prompt,
-                },
-            ],
-        }
-    ]
-
-    payload = {
-        "messages": messages,
-        **params,
-    }
-
-    headers = headers or {}
-
-    r = requests.post(
-        str(url),
-        headers=headers,
-        json=payload,
-        timeout=timeout,
-    )
-    if not r.ok:
-        _log.error(f"Error calling the API. Response was {r.text}")
-    r.raise_for_status()
-
-    api_resp = OpenAiApiResponse.model_validate_json(r.text)
-    generated_text = api_resp.choices[0].message.content.strip()
-    return generated_text