fix: Linting, formatting, and bug fixes

The one bug fix is in the timeout arg passed to openai_image_request. Otherwise,
these are all style changes to get mypy and black passing cleanly.

Branch: OllamaVlmModel

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Gabe Goodhart  2025-04-09 11:54:22 -06:00
commit f14c1b4f05
parent 7b7a3a2004
7 changed files with 50 additions and 38 deletions
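
For context, the one bug fix: PictureDescriptionApiModel was passing its headers mapping as the timeout argument of openai_image_request. A minimal sketch of calling the corrected helper directly (the import path and a running local Ollama server are assumptions; argument names are taken from the diff below):

    from PIL import Image

    from docling.utils.utils import openai_image_request  # import path assumed

    page_image = Image.open("page.png")
    markdown = openai_image_request(
        image=page_image,
        prompt="OCR the full page to markdown.",
        url="http://localhost:11434/v1/chat/completions",  # the helper's Ollama default
        timeout=120,  # seconds; this is the argument that previously received the headers dict
    )
    print(markdown)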


@@ -537,7 +537,9 @@ def convert(
         if vlm_model == VlmModelType.GRANITE_VISION:
             pipeline_options.vlm_options = granite_vision_vlm_conversion_options
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-            pipeline_options.vlm_options = granite_vision_vlm_ollama_conversion_options
+            pipeline_options.vlm_options = (
+                granite_vision_vlm_ollama_conversion_options
+            )
         elif vlm_model == VlmModelType.SMOLDOCLING:
             pipeline_options.vlm_options = smoldocling_vlm_conversion_options
             if sys.platform == "darwin":


@@ -266,6 +266,7 @@ class Page(BaseModel):
 ## OpenAI API Request / Response Models ##
 class OpenAiChatMessage(BaseModel):
     role: str
     content: str


@@ -289,8 +289,8 @@ class OpenAiVlmOptions(BaseVlmOptions):
     kind: Literal["openai_model_options"] = "openai_model_options"
 
     model_id: str
-    base_url: str = "http://localhost:11434/v1" # Default to ollama
-    apikey: str | None = None,
+    base_url: str = "http://localhost:11434/v1"  # Default to ollama
+    apikey: Optional[str] = None
     scale: float = 2.0
     timeout: float = 60
     response_format: ResponseFormat
@@ -322,8 +322,8 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
 
 granite_vision_vlm_ollama_conversion_options = OpenAiVlmOptions(
     model_id="granite3.2-vision:2b",
     prompt="OCR the full page to markdown.",
-    scale = 1.0,
-    timeout = 120,
+    scale=1.0,
+    timeout=120,
     response_format=ResponseFormat.MARKDOWN,
 )
@@ -383,7 +383,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
+    vlm_options: Union[HuggingFaceVlmOptions, OpenAiVlmOptions] = (
+        smoldocling_vlm_conversion_options
+    )
 
 
 class PdfPipelineOptions(PaginatedPipelineOptions):
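
With vlm_options on VlmPipelineOptions now typed as Union[HuggingFaceVlmOptions, OpenAiVlmOptions], the Ollama-backed options can be plugged into a conversion roughly as follows (a minimal sketch; the DocumentConverter wiring and import paths are assumptions, not part of this diff):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        VlmPipelineOptions,
        granite_vision_vlm_ollama_conversion_options,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # Route PDF conversion through the VLM pipeline using the OpenAI-compatible (Ollama) options
    pipeline_options = VlmPipelineOptions(
        vlm_options=granite_vision_vlm_ollama_conversion_options
    )
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
        }
    )
    result = converter.convert("document.pdf")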


@@ -18,11 +18,15 @@ class OpenAiVlmModel(BasePageModel):
         self.enabled = enabled
         self.vlm_options = vlm_options
         if self.enabled:
-            self.url = "/".join([self.vlm_options.base_url.rstrip("/"), "chat/completions"])
+            self.url = "/".join(
+                [self.vlm_options.base_url.rstrip("/"), "chat/completions"]
+            )
             self.apikey = self.vlm_options.apikey
             self.model_id = self.vlm_options.model_id
             self.timeout = self.vlm_options.timeout
-            self.prompt_content = f"This is a page from a document.\n{self.vlm_options.prompt}"
+            self.prompt_content = (
+                f"This is a page from a document.\n{self.vlm_options.prompt}"
+            )
 
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
@@ -36,6 +40,7 @@ class OpenAiVlmModel(BasePageModel):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    assert hi_res_image is not None
                     if hi_res_image:
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")


@@ -52,7 +52,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
-                timeout=self.options.headers,
+                timeout=self.options.timeout,
                 headers=self.options.headers,
                 **self.options.params,
             )


@@ -64,30 +64,31 @@ class VlmPipeline(PaginatedPipeline):
             self.build_pipe = [
                 OpenAiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        elif (
-            self.pipeline_options.vlm_options.inference_framework
-            == InferenceFramework.MLX
-        ):
-            self.build_pipe = [
-                HuggingFaceMlxModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        elif isinstance(pipeline_options.vlm_options, HuggingFaceVlmOptions):
-            self.build_pipe = [
-                HuggingFaceVlmModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
+                    vlm_options=cast(
+                        OpenAiVlmOptions, self.pipeline_options.vlm_options
+                    ),
+                ),
+            ]
+        elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
+            vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
+            if vlm_options.inference_framework == InferenceFramework.MLX:
+                self.build_pipe = [
+                    HuggingFaceMlxModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
+            else:
+                self.build_pipe = [
+                    HuggingFaceVlmModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument


@@ -4,10 +4,11 @@ import logging
 from io import BytesIO
 from itertools import islice
 from pathlib import Path
-from typing import List, Union
+from typing import Dict, List, Optional, Union
 
 import requests
 from PIL import Image
+from pydantic import AnyUrl
 from tqdm import tqdm
 
 from docling.datamodel.base_models import OpenAiApiResponse
@@ -75,10 +76,12 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
 def openai_image_request(
     image: Image.Image,
     prompt: str,
-    url: str = "http://localhost:11434/v1/chat/completions", # Default to ollama
-    apikey: str | None = None,
+    url: Union[
+        AnyUrl, str
+    ] = "http://localhost:11434/v1/chat/completions",  # Default to ollama
+    apikey: Optional[str] = None,
     timeout: float = 20,
-    headers: dict[str, str] | None = None,
+    headers: Optional[Dict[str, str]] = None,
     **params,
 ) -> str:
     img_io = BytesIO()
@@ -90,9 +93,7 @@ def openai_image_request(
             "content": [
                 {
                     "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{image_base64}"
-                    },
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                 },
                 {
                     "type": "text",