Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-27 04:24:45 +00:00)
fix: Linting, formatting, and bug fixes
The one bug fix was in the timeout arg to openai_image_request. Otherwise, this is all style changes to get MyPy and black passing cleanly.

Branch: OllamaVlmModel

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
parent 7b7a3a2004
commit f14c1b4f05
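For context, a minimal, self-contained sketch (hypothetical names, not the docling code) of the keyword-argument mix-up fixed below and of why annotated signatures let mypy flag it:

```python
from typing import Dict, Optional


def api_request(timeout: float = 20, headers: Optional[Dict[str, str]] = None) -> None:
    """Stand-in for an HTTP helper that takes a float timeout and a header dict."""


class Options:
    timeout: float = 60
    headers: Dict[str, str] = {}


opts = Options()

# Before the fix, the headers dict was passed as the timeout value; with the
# annotations above, mypy reports: Argument "timeout" has incompatible type
# "Dict[str, str]"; expected "float".
# api_request(timeout=opts.headers, headers=opts.headers)

# After the fix, each option is forwarded to the matching keyword argument.
api_request(timeout=opts.timeout, headers=opts.headers)
```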
```diff
@@ -537,7 +537,9 @@ def convert(
         if vlm_model == VlmModelType.GRANITE_VISION:
             pipeline_options.vlm_options = granite_vision_vlm_conversion_options
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-            pipeline_options.vlm_options = granite_vision_vlm_ollama_conversion_options
+            pipeline_options.vlm_options = (
+                granite_vision_vlm_ollama_conversion_options
+            )
         elif vlm_model == VlmModelType.SMOLDOCLING:
             pipeline_options.vlm_options = smoldocling_vlm_conversion_options
             if sys.platform == "darwin":
```
```diff
@@ -266,6 +266,7 @@ class Page(BaseModel):
 
 ## OpenAI API Request / Response Models ##
 
+
 class OpenAiChatMessage(BaseModel):
     role: str
     content: str
```
```diff
@@ -289,8 +289,8 @@ class OpenAiVlmOptions(BaseVlmOptions):
     kind: Literal["openai_model_options"] = "openai_model_options"
 
     model_id: str
     base_url: str = "http://localhost:11434/v1"  # Default to ollama
-    apikey: str | None = None,
+    apikey: Optional[str] = None
     scale: float = 2.0
     timeout: float = 60
     response_format: ResponseFormat
```
```diff
@@ -322,8 +322,8 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
 granite_vision_vlm_ollama_conversion_options = OpenAiVlmOptions(
     model_id="granite3.2-vision:2b",
     prompt="OCR the full page to markdown.",
-    scale = 1.0,
-    timeout = 120,
+    scale=1.0,
+    timeout=120,
     response_format=ResponseFormat.MARKDOWN,
 )
 
```
```diff
@@ -383,7 +383,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
+    vlm_options: Union[HuggingFaceVlmOptions, OpenAiVlmOptions] = (
+        smoldocling_vlm_conversion_options
+    )
 
 
 class PdfPipelineOptions(PaginatedPipelineOptions):
```
```diff
@@ -18,11 +18,15 @@ class OpenAiVlmModel(BasePageModel):
         self.enabled = enabled
         self.vlm_options = vlm_options
         if self.enabled:
-            self.url = "/".join([self.vlm_options.base_url.rstrip("/"), "chat/completions"])
+            self.url = "/".join(
+                [self.vlm_options.base_url.rstrip("/"), "chat/completions"]
+            )
             self.apikey = self.vlm_options.apikey
             self.model_id = self.vlm_options.model_id
             self.timeout = self.vlm_options.timeout
-            self.prompt_content = f"This is a page from a document.\n{self.vlm_options.prompt}"
+            self.prompt_content = (
+                f"This is a page from a document.\n{self.vlm_options.prompt}"
+            )
 
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
```
```diff
@@ -36,6 +40,7 @@ class OpenAiVlmModel(BasePageModel):
                 assert page.size is not None
 
                 hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                assert hi_res_image is not None
                 if hi_res_image:
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
```
```diff
@@ -52,7 +52,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
-                timeout=self.options.headers,
+                timeout=self.options.timeout,
                 headers=self.options.headers,
                 **self.options.params,
             )
```
```diff
@@ -64,30 +64,31 @@ class VlmPipeline(PaginatedPipeline):
             self.build_pipe = [
                 OpenAiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        elif (
-            self.pipeline_options.vlm_options.inference_framework
-            == InferenceFramework.MLX
-        ):
-            self.build_pipe = [
-                HuggingFaceMlxModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        elif isinstance(pipeline_options.vlm_options, HuggingFaceVlmOptions):
-            self.build_pipe = [
-                HuggingFaceVlmModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
+                    vlm_options=cast(
+                        OpenAiVlmOptions, self.pipeline_options.vlm_options
+                    ),
                 ),
             ]
+        elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
+            vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
+            if vlm_options.inference_framework == InferenceFramework.MLX:
+                self.build_pipe = [
+                    HuggingFaceMlxModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
+            else:
+                self.build_pipe = [
+                    HuggingFaceVlmModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
```
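The refactor above narrows the `Union`-typed `vlm_options` field with `typing.cast` so mypy accepts each branch-specific constructor. A minimal sketch of that pattern (hypothetical classes, not the docling ones); `cast` only informs the type checker and has no runtime effect:

```python
from typing import Union, cast


class OllamaOpts:
    kind = "ollama"
    base_url = "http://localhost:11434/v1"


class HfOpts:
    kind = "huggingface"
    repo_id = "some/model"


def build(options: Union[OllamaOpts, HfOpts]) -> str:
    if options.kind == "ollama":
        # The string comparison does not narrow the Union for mypy,
        # so cast() tells the checker which member this branch handles;
        # at runtime it just returns its argument unchanged.
        ollama_opts = cast(OllamaOpts, options)
        return ollama_opts.base_url
    hf_opts = cast(HfOpts, options)
    return hf_opts.repo_id


print(build(OllamaOpts()))  # -> http://localhost:11434/v1
```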
```diff
@@ -4,10 +4,11 @@ import logging
 from io import BytesIO
 from itertools import islice
 from pathlib import Path
-from typing import List, Union
+from typing import Dict, List, Optional, Union
 
 import requests
 from PIL import Image
+from pydantic import AnyUrl
 from tqdm import tqdm
 
 from docling.datamodel.base_models import OpenAiApiResponse
```
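Context not stated in the diff: `Optional[X]` is the same type as `X | None`, but the PEP 604 `|` spelling only evaluates at runtime on Python 3.10+, and subscripting builtin `dict` needs 3.9+, so the `typing` aliases keep these annotations importable on older interpreters. A tiny illustration:

```python
from typing import Dict, Optional

# Equivalent annotations; the typing spellings also parse on Python < 3.10.
apikey: Optional[str] = None              # same type as: str | None
headers: Optional[Dict[str, str]] = None  # same type as: dict[str, str] | None
```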
```diff
@@ -75,10 +76,12 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
 def openai_image_request(
     image: Image.Image,
     prompt: str,
-    url: str = "http://localhost:11434/v1/chat/completions",  # Default to ollama
-    apikey: str | None = None,
+    url: Union[
+        AnyUrl, str
+    ] = "http://localhost:11434/v1/chat/completions",  # Default to ollama
+    apikey: Optional[str] = None,
     timeout: float = 20,
-    headers: dict[str, str] | None = None,
+    headers: Optional[Dict[str, str]] = None,
     **params,
 ) -> str:
     img_io = BytesIO()
```
|
|||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||||
"url": f"data:image/png;base64,{image_base64}"
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "text",
|
"type": "text",
|
||||||
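Putting the pieces together, a hedged usage sketch of `openai_image_request` against a local Ollama server, reusing the prompt, timeout, and model id from the `granite_vision_vlm_ollama_conversion_options` preset above. The import path and the forwarding of `model` through `**params` into the request body are assumptions, not confirmed by this diff:

```python
# Sketch only: assumes Ollama is serving granite3.2-vision:2b locally and that
# openai_image_request lives in docling's utils module (exact path assumed).
from PIL import Image

from docling.utils.utils import openai_image_request  # import path is an assumption

page_image = Image.open("page.png").convert("RGB")

markdown = openai_image_request(
    image=page_image,
    prompt="OCR the full page to markdown.",
    url="http://localhost:11434/v1/chat/completions",
    timeout=120,  # matches the granite-vision Ollama preset in this commit
    model="granite3.2-vision:2b",  # assumed to reach the request body via **params
)
print(markdown)
```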