rename and refactor

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2025-04-10 11:33:00 +02:00
parent 0f438b3a76
commit f77c8cf96c
6 changed files with 81 additions and 77 deletions

docling/datamodel/pipeline_options.py

@@ -285,10 +285,12 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")
 
 
-class OpenAiVlmOptions(BaseVlmOptions):
-    kind: Literal["openai_model_options"] = "openai_model_options"
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
 
-    url: AnyUrl = AnyUrl("http://localhost:11434/v1/chat/completions")  # Default to ollama
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
     scale: float = 2.0
@@ -319,7 +321,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
-granite_vision_vlm_ollama_conversion_options = OpenAiVlmOptions(
+granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
     prompt="OCR the full page to markdown.",
@@ -384,7 +386,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions, OpenAiVlmOptions] = (
+    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
        smoldocling_vlm_conversion_options
     )
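
With the rename, `ApiVlmOptions` makes explicit that any OpenAI-compatible chat-completions endpoint can be targeted, not only OpenAI itself. A minimal sketch of a custom configuration; the endpoint, auth header, and model name are illustrative placeholders, and `response_format` is assumed from the surrounding options classes (the pipeline imports `ResponseFormat` below) rather than shown in this hunk:

from pydantic import AnyUrl

from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat

# Hypothetical options for an authenticated OpenAI-compatible endpoint.
vlm_options = ApiVlmOptions(
    url=AnyUrl("https://my-vlm.example.com/v1/chat/completions"),  # placeholder endpoint
    headers={"Authorization": "Bearer <token>"},  # extra HTTP headers, e.g. auth
    params={"model": "my-vision-model"},  # merged into the JSON request payload
    prompt="OCR the full page to markdown.",
    scale=2.0,  # page-image scale before upload
    response_format=ResponseFormat.MARKDOWN,  # assumed field, see note above
)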

docling/models/openai_vlm_model.py → docling/models/api_vlm_model.py (renamed)

@@ -2,18 +2,18 @@ from typing import Iterable
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import OpenAiVlmOptions
+from docling.datamodel.pipeline_options import ApiVlmOptions
 from docling.models.base_model import BasePageModel
+from docling.utils.api_image_request import api_image_request
 from docling.utils.profiling import TimeRecorder
-from docling.utils.utils import openai_image_request
 
 
-class OpenAiVlmModel(BasePageModel):
+class ApiVlmModel(BasePageModel):
     def __init__(
         self,
         enabled: bool,
-        vlm_options: OpenAiVlmOptions,
+        vlm_options: ApiVlmOptions,
     ):
         self.enabled = enabled
         self.vlm_options = vlm_options
@@ -44,7 +44,7 @@ class OpenAiVlmModel(BasePageModel):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
 
-                page_tags = openai_image_request(
+                page_tags = api_image_request(
                     image=hi_res_image,
                     prompt=self.prompt_content,
                     url=self.vlm_options.url,

docling/models/picture_description_api_model.py

@@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.exceptions import OperationNotAllowed
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
-from docling.utils.utils import openai_image_request
+from docling.utils.api_image_request import api_image_request
 
 
 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -48,7 +48,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
         for image in images:
-            yield openai_image_request(
+            yield api_image_request(
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
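
On the enrichment side, the same helper now serves `PictureDescriptionApiModel`. A hedged sketch of how this model is typically enabled, assuming the `PictureDescriptionApiOptions` and `enable_remote_services` knobs from the surrounding codebase (neither is introduced by this commit; endpoint and model name are placeholders):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(
    enable_remote_services=True,  # required, since images leave the local machine
    do_picture_description=True,
    picture_description_options=PictureDescriptionApiOptions(
        url="http://localhost:11434/v1/chat/completions",  # placeholder endpoint
        params={"model": "granite3.2-vision:2b"},  # placeholder model name
        prompt="Describe this picture in two sentences.",
    ),
)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)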

docling/pipeline/vlm_pipeline.py

@@ -15,16 +15,16 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
     HuggingFaceVlmOptions,
     InferenceFramework,
-    OpenAiVlmOptions,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.api_vlm_model import ApiVlmModel
 from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.models.openai_vlm_model import OpenAiVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -60,13 +60,11 @@ class VlmPipeline(PaginatedPipeline):
         self.keep_images = self.pipeline_options.generate_page_images
 
-        if isinstance(pipeline_options.vlm_options, OpenAiVlmOptions):
+        if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
             self.build_pipe = [
-                OpenAiVlmModel(
+                ApiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-                    vlm_options=cast(
-                        OpenAiVlmOptions, self.pipeline_options.vlm_options
-                    ),
+                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                 ),
             ]
         elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
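
End to end, an `ApiVlmOptions` instance routed through `VlmPipelineOptions` now selects `ApiVlmModel` instead of a local HuggingFace model. A sketch of a full conversion using the Ollama preset from this commit; `enable_remote_services` and the input path are assumptions, not part of this diff:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(
    enable_remote_services=True,  # assumed knob: page images are sent to a remote API
    vlm_options=granite_vision_vlm_ollama_conversion_options,  # ApiVlmOptions preset above
)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("document.pdf")  # placeholder input path
print(result.document.export_to_markdown())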

docling/utils/api_image_request.py (new file)

@@ -0,0 +1,61 @@
+import base64
+import logging
+from io import BytesIO
+from typing import Dict, Optional
+
+import requests
+from PIL import Image
+from pydantic import AnyUrl
+
+from docling.datamodel.base_models import OpenAiApiResponse
+
+_log = logging.getLogger(__name__)
+
+
+def api_image_request(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    **params,
+) -> str:
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                },
+                {
+                    "type": "text",
+                    "text": prompt,
+                },
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        **params,
+    }
+
+    headers = headers or {}
+
+    r = requests.post(
+        str(url),
+        headers=headers,
+        json=payload,
+        timeout=timeout,
+    )
+    if not r.ok:
+        _log.error(f"Error calling the API. Response was {r.text}")
+    r.raise_for_status()
+
+    api_resp = OpenAiApiResponse.model_validate_json(r.text)
+    generated_text = api_resp.choices[0].message.content.strip()
+    return generated_text
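
Since the full signature is visible above, the helper can also be exercised in isolation. A quick sketch against a local Ollama server; the image path and model name are placeholders, and extra keyword arguments travel through **params into the JSON payload:

from PIL import Image
from pydantic import AnyUrl

from docling.utils.api_image_request import api_image_request

image = Image.open("page.png")  # placeholder input image
text = api_image_request(
    image=image,
    prompt="OCR the full page to markdown.",
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    timeout=60,
    model="granite3.2-vision:2b",  # forwarded via **params into the payload
)
print(text)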

docling/utils/utils.py

@@ -1,20 +1,12 @@
-import base64
 import hashlib
-import logging
 from io import BytesIO
 from itertools import islice
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import List, Union
 
 import requests
-from PIL import Image
-from pydantic import AnyUrl
 from tqdm import tqdm
 
-from docling.datamodel.base_models import OpenAiApiResponse
-
-_log = logging.getLogger(__name__)
-
 
 def chunkify(iterator, chunk_size):
     """Yield successive chunks of chunk_size from the iterable."""
@@ -71,52 +63,3 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
     buf.seek(0)
     return buf
-
-
-def openai_image_request(
-    image: Image.Image,
-    prompt: str,
-    url: AnyUrl,
-    timeout: float = 20,
-    headers: Optional[Dict[str, str]] = None,
-    **params,
-) -> str:
-    img_io = BytesIO()
-    image.save(img_io, "PNG")
-    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                },
-                {
-                    "type": "text",
-                    "text": prompt,
-                },
-            ],
-        }
-    ]
-
-    payload = {
-        "messages": messages,
-        **params,
-    }
-
-    headers = headers or {}
-
-    r = requests.post(
-        str(url),
-        headers=headers,
-        json=payload,
-        timeout=timeout,
-    )
-    if not r.ok:
-        _log.error(f"Error calling the API. Response was {r.text}")
-    r.raise_for_status()
-
-    api_resp = OpenAiApiResponse.model_validate_json(r.text)
-    generated_text = api_resp.choices[0].message.content.strip()
-    return generated_text