diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 63e88a66..9791a251 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -285,10 +285,12 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")
 
 
-class OpenAiVlmOptions(BaseVlmOptions):
-    kind: Literal["openai_model_options"] = "openai_model_options"
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
 
-    url: AnyUrl = AnyUrl("http://localhost:11434/v1/chat/completions")  # Default to ollama
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
     scale: float = 2.0
@@ -319,7 +321,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
-granite_vision_vlm_ollama_conversion_options = OpenAiVlmOptions(
+granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
     prompt="OCR the full page to markdown.",
@@ -384,7 +386,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions, OpenAiVlmOptions] = (
+    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
         smoldocling_vlm_conversion_options
     )
 
diff --git a/docling/models/openai_vlm_model.py b/docling/models/api_vlm_model.py
similarity index 87%
rename from docling/models/openai_vlm_model.py
rename to docling/models/api_vlm_model.py
index bf06fb2c..8562bd14 100644
--- a/docling/models/openai_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@@ -2,18 +2,18 @@ from typing import Iterable
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import OpenAiVlmOptions
+from docling.datamodel.pipeline_options import ApiVlmOptions
 from docling.models.base_model import BasePageModel
+from docling.utils.api_image_request import api_image_request
 from docling.utils.profiling import TimeRecorder
-from docling.utils.utils import openai_image_request
 
 
-class OpenAiVlmModel(BasePageModel):
+class ApiVlmModel(BasePageModel):
     def __init__(
         self,
         enabled: bool,
-        vlm_options: OpenAiVlmOptions,
+        vlm_options: ApiVlmOptions,
     ):
         self.enabled = enabled
         self.vlm_options = vlm_options
 
@@ -44,7 +44,7 @@ class OpenAiVlmModel(BasePageModel):
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
 
-                    page_tags = openai_image_request(
+                    page_tags = api_image_request(
                         image=hi_res_image,
                         prompt=self.prompt_content,
                         url=self.vlm_options.url,
diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py
index 9a9771ae..1aa73518 100644
--- a/docling/models/picture_description_api_model.py
+++ b/docling/models/picture_description_api_model.py
@@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.exceptions import OperationNotAllowed
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
-from docling.utils.utils import openai_image_request
+from docling.utils.api_image_request import api_image_request
 
 
 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -48,7 +48,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
         for image in images:
-            yield openai_image_request(
+            yield api_image_request(
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index dcfd92a4..c1504be3 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -15,16 +15,16 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
     HuggingFaceVlmOptions,
     InferenceFramework,
-    OpenAiVlmOptions,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.api_vlm_model import ApiVlmModel
 from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.models.openai_vlm_model import OpenAiVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
@@ -60,13 +60,11 @@ class VlmPipeline(PaginatedPipeline):
 
         self.keep_images = self.pipeline_options.generate_page_images
 
-        if isinstance(pipeline_options.vlm_options, OpenAiVlmOptions):
+        if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
             self.build_pipe = [
-                OpenAiVlmModel(
+                ApiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-                    vlm_options=cast(
-                        OpenAiVlmOptions, self.pipeline_options.vlm_options
-                    ),
+                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                 ),
             ]
         elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
diff --git a/docling/utils/api_image_request.py b/docling/utils/api_image_request.py
new file mode 100644
index 00000000..9227389c
--- /dev/null
+++ b/docling/utils/api_image_request.py
@@ -0,0 +1,61 @@
+import base64
+import logging
+from io import BytesIO
+from typing import Dict, Optional
+
+import requests
+from PIL import Image
+from pydantic import AnyUrl
+
+from docling.datamodel.base_models import OpenAiApiResponse
+
+_log = logging.getLogger(__name__)
+
+
+def api_image_request(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    **params,
+) -> str:
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                },
+                {
+                    "type": "text",
+                    "text": prompt,
+                },
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        **params,
+    }
+
+    headers = headers or {}
+
+    r = requests.post(
+        str(url),
+        headers=headers,
+        json=payload,
+        timeout=timeout,
+    )
+    if not r.ok:
+        _log.error(f"Error calling the API. Response was {r.text}")
+        r.raise_for_status()
+
+    api_resp = OpenAiApiResponse.model_validate_json(r.text)
+    generated_text = api_resp.choices[0].message.content.strip()
+    return generated_text
diff --git a/docling/utils/utils.py b/docling/utils/utils.py
index 51dca03a..1261f860 100644
--- a/docling/utils/utils.py
+++ b/docling/utils/utils.py
@@ -1,20 +1,12 @@
-import base64
 import hashlib
-import logging
 from io import BytesIO
 from itertools import islice
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import List, Union
 
 import requests
-from PIL import Image
-from pydantic import AnyUrl
 from tqdm import tqdm
 
-from docling.datamodel.base_models import OpenAiApiResponse
-
-_log = logging.getLogger(__name__)
-
 
 def chunkify(iterator, chunk_size):
     """Yield successive chunks of chunk_size from the iterable."""
@@ -71,52 +63,3 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
 
     buf.seek(0)
     return buf
-
-
-def openai_image_request(
-    image: Image.Image,
-    prompt: str,
-    url: AnyUrl,
-    timeout: float = 20,
-    headers: Optional[Dict[str, str]] = None,
-    **params,
-) -> str:
-    img_io = BytesIO()
-    image.save(img_io, "PNG")
-    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                },
-                {
-                    "type": "text",
-                    "text": prompt,
-                },
-            ],
-        }
-    ]
-
-    payload = {
-        "messages": messages,
-        **params,
-    }
-
-    headers = headers or {}
-
-    r = requests.post(
-        str(url),
-        headers=headers,
-        json=payload,
-        timeout=timeout,
-    )
-    if not r.ok:
-        _log.error(f"Error calling the API. Response was {r.text}")
-        r.raise_for_status()
-
-    api_resp = OpenAiApiResponse.model_validate_json(r.text)
-    generated_text = api_resp.choices[0].message.content.strip()
-    return generated_text
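
Usage note: below is a minimal sketch of driving the renamed API-based VLM pipeline against the default local Ollama endpoint. The converter wiring (DocumentConverter, PdfFormatOption, VlmPipeline) comes from the wider docling codebase rather than this diff, and the timeout / response_format fields on ApiVlmOptions are assumed to be inherited from its base class; treat those details, and the input path, as assumptions rather than part of the patch.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    ResponseFormat,
    VlmPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Opt in to remote/API-backed models; API options default to the Ollama URL above.
pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = ApiVlmOptions(
    url="http://localhost:11434/v1/chat/completions",  # the Ollama default from the diff
    params={"model": "granite3.2-vision:2b"},  # forwarded verbatim into the request payload
    prompt="OCR the full page to markdown.",
    timeout=90,  # assumed field, inherited from the options base class
    response_format=ResponseFormat.MARKDOWN,  # assumed field, not visible in this diff
)

# Route PDF conversion through the VLM pipeline with the options above.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("input.pdf")  # hypothetical input path
print(result.document.export_to_markdown())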