From f14c1b4f05b429355426adc54455e914032b1b02 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Wed, 9 Apr 2025 11:54:22 -0600
Subject: [PATCH] fix: Linting, formatting, and bug fixes

The one bug fix was in the timeout arg to openai_image_request. Otherwise,
this is all style changes to get MyPy and black passing cleanly.

Branch: OllamaVlmModel

Signed-off-by: Gabe Goodhart
---
 docling/cli/main.py                          |  4 +-
 docling/datamodel/base_models.py             |  1 +
 docling/datamodel/pipeline_options.py        | 12 ++---
 docling/models/openai_vlm_model.py           |  9 +++-
 .../models/picture_description_api_model.py  |  2 +-
 docling/pipeline/vlm_pipeline.py             | 45 ++++++++++---------
 docling/utils/utils.py                       | 15 ++++---
 7 files changed, 50 insertions(+), 38 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 71527b92..b0204151 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -537,7 +537,9 @@ def convert(
         if vlm_model == VlmModelType.GRANITE_VISION:
             pipeline_options.vlm_options = granite_vision_vlm_conversion_options
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-            pipeline_options.vlm_options = granite_vision_vlm_ollama_conversion_options
+            pipeline_options.vlm_options = (
+                granite_vision_vlm_ollama_conversion_options
+            )
         elif vlm_model == VlmModelType.SMOLDOCLING:
             pipeline_options.vlm_options = smoldocling_vlm_conversion_options
             if sys.platform == "darwin":
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index ca879131..7dcf89c0 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -266,6 +266,7 @@ class Page(BaseModel):
 
 ## OpenAI API Request / Response Models ##
 
+
 class OpenAiChatMessage(BaseModel):
     role: str
     content: str
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index aeb8f7b2..ddafa02c 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -289,8 +289,8 @@ class OpenAiVlmOptions(BaseVlmOptions):
     kind: Literal["openai_model_options"] = "openai_model_options"
 
     model_id: str
-    base_url: str = "http://localhost:11434/v1" # Default to ollama
-    apikey: str | None = None,
+    base_url: str = "http://localhost:11434/v1"  # Default to ollama
+    apikey: Optional[str] = None
     scale: float = 2.0
     timeout: float = 60
     response_format: ResponseFormat
@@ -322,8 +322,8 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
 granite_vision_vlm_ollama_conversion_options = OpenAiVlmOptions(
     model_id="granite3.2-vision:2b",
     prompt="OCR the full page to markdown.",
-    scale = 1.0,
-    timeout = 120,
+    scale=1.0,
+    timeout=120,
     response_format=ResponseFormat.MARKDOWN,
 )
 
@@ -383,7 +383,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
+    vlm_options: Union[HuggingFaceVlmOptions, OpenAiVlmOptions] = (
+        smoldocling_vlm_conversion_options
+    )
 
 
 class PdfPipelineOptions(PaginatedPipelineOptions):
diff --git a/docling/models/openai_vlm_model.py b/docling/models/openai_vlm_model.py
index 4a1baa9d..eb7c45ce 100644
--- a/docling/models/openai_vlm_model.py
+++ b/docling/models/openai_vlm_model.py
@@ -18,11 +18,15 @@ class OpenAiVlmModel(BasePageModel):
         self.enabled = enabled
         self.vlm_options = vlm_options
         if self.enabled:
-            self.url = "/".join([self.vlm_options.base_url.rstrip("/"), "chat/completions"])
+            self.url = "/".join(
+                [self.vlm_options.base_url.rstrip("/"), "chat/completions"]
+            )
             self.apikey = self.vlm_options.apikey
             self.model_id = self.vlm_options.model_id
             self.timeout = self.vlm_options.timeout
-            self.prompt_content = f"This is a page from a document.\n{self.vlm_options.prompt}"
+            self.prompt_content = (
+                f"This is a page from a document.\n{self.vlm_options.prompt}"
+            )
 
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
@@ -36,6 +40,7 @@ class OpenAiVlmModel(BasePageModel):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    assert hi_res_image is not None
                     if hi_res_image:
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py
index 610a4f5b..9a9771ae 100644
--- a/docling/models/picture_description_api_model.py
+++ b/docling/models/picture_description_api_model.py
@@ -52,7 +52,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
-                timeout=self.options.headers,
+                timeout=self.options.timeout,
                 headers=self.options.headers,
                 **self.options.params,
             )
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 46d936dc..dcfd92a4 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -64,30 +64,31 @@ class VlmPipeline(PaginatedPipeline):
             self.build_pipe = [
                 OpenAiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        elif (
-            self.pipeline_options.vlm_options.inference_framework
-            == InferenceFramework.MLX
-        ):
-            self.build_pipe = [
-                HuggingFaceMlxModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        elif isinstance(pipeline_options.vlm_options, HuggingFaceVlmOptions):
-            self.build_pipe = [
-                HuggingFaceVlmModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
+                    vlm_options=cast(
+                        OpenAiVlmOptions, self.pipeline_options.vlm_options
+                    ),
                 ),
             ]
+        elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
+            vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
+            if vlm_options.inference_framework == InferenceFramework.MLX:
+                self.build_pipe = [
+                    HuggingFaceMlxModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
+            else:
+                self.build_pipe = [
+                    HuggingFaceVlmModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
diff --git a/docling/utils/utils.py b/docling/utils/utils.py
index 98c3e692..ecc21baf 100644
--- a/docling/utils/utils.py
+++ b/docling/utils/utils.py
@@ -4,10 +4,11 @@ import logging
 from io import BytesIO
 from itertools import islice
 from pathlib import Path
-from typing import List, Union
+from typing import Dict, List, Optional, Union
 
 import requests
 from PIL import Image
+from pydantic import AnyUrl
 from tqdm import tqdm
 
 from docling.datamodel.base_models import OpenAiApiResponse
@@ -75,10 +76,12 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
 def openai_image_request(
     image: Image.Image,
     prompt: str,
-    url: str = "http://localhost:11434/v1/chat/completions", # Default to ollama
-    apikey: str | None = None,
+    url: Union[
+        AnyUrl, str
+    ] = "http://localhost:11434/v1/chat/completions",  # Default to ollama
+    apikey: Optional[str] = None,
     timeout: float = 20,
-    headers: dict[str, str] | None = None,
+    headers: Optional[Dict[str, str]] = None,
     **params,
 ) -> str:
     img_io = BytesIO()
@@ -90,9 +93,7 @@ def openai_image_request(
             "content": [
                 {
                     "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{image_base64}"
-                    },
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                 },
                 {
                     "type": "text",
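
For reference, a minimal usage sketch (not part of the patch) of the pieces this commit touches: the Ollama-backed OpenAiVlmOptions preset and the openai_image_request helper, whose timeout argument was previously fed the headers dict by PictureDescriptionApiModel. The page image path and a locally running Ollama server with granite3.2-vision:2b pulled are assumptions for illustration.

    # Sketch only: call the request helper directly with the Ollama preset.
    from PIL import Image

    from docling.datamodel.pipeline_options import (
        granite_vision_vlm_ollama_conversion_options as opts,
    )
    from docling.utils.utils import openai_image_request

    image = Image.open("page.png")  # hypothetical rendering of a single page

    markdown = openai_image_request(
        image=image,
        prompt=opts.prompt,
        # same base_url + "chat/completions" join that OpenAiVlmModel performs
        url="/".join([opts.base_url.rstrip("/"), "chat/completions"]),
        apikey=opts.apikey,
        timeout=opts.timeout,  # 120 s for the Ollama preset; now forwarded correctly
    )
    print(markdown)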