mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
rename and refactor
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
0f438b3a76
commit
f77c8cf96c
@ -285,10 +285,12 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
|
||||
return self.repo_id.replace("/", "--")
|
||||
|
||||
|
||||
class OpenAiVlmOptions(BaseVlmOptions):
|
||||
kind: Literal["openai_model_options"] = "openai_model_options"
|
||||
class ApiVlmOptions(BaseVlmOptions):
|
||||
kind: Literal["api_model_options"] = "api_model_options"
|
||||
|
||||
url: AnyUrl = AnyUrl("http://localhost:11434/v1/chat/completions") # Default to ollama
|
||||
url: AnyUrl = AnyUrl(
|
||||
"http://localhost:11434/v1/chat/completions"
|
||||
) # Default to ollama
|
||||
headers: Dict[str, str] = {}
|
||||
params: Dict[str, Any] = {}
|
||||
scale: float = 2.0
|
||||
@ -319,7 +321,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
||||
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||
)
|
||||
|
||||
granite_vision_vlm_ollama_conversion_options = OpenAiVlmOptions(
|
||||
granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
|
||||
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
|
||||
params={"model": "granite3.2-vision:2b"},
|
||||
prompt="OCR the full page to markdown.",
|
||||
@ -384,7 +386,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
||||
False # (To be used with vlms, or other generative models)
|
||||
)
|
||||
# If True, text from backend will be used instead of generated text
|
||||
vlm_options: Union[HuggingFaceVlmOptions, OpenAiVlmOptions] = (
|
||||
vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
|
||||
smoldocling_vlm_conversion_options
|
||||
)
|
||||
|
||||
|
@ -2,18 +2,18 @@ from typing import Iterable
|
||||
|
||||
from docling.datamodel.base_models import Page, VlmPrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OpenAiVlmOptions
|
||||
from docling.datamodel.pipeline_options import ApiVlmOptions
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.api_image_request import api_image_request
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docling.utils.utils import openai_image_request
|
||||
|
||||
|
||||
class OpenAiVlmModel(BasePageModel):
|
||||
class ApiVlmModel(BasePageModel):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
vlm_options: OpenAiVlmOptions,
|
||||
vlm_options: ApiVlmOptions,
|
||||
):
|
||||
self.enabled = enabled
|
||||
self.vlm_options = vlm_options
|
||||
@ -44,7 +44,7 @@ class OpenAiVlmModel(BasePageModel):
|
||||
if hi_res_image.mode != "RGB":
|
||||
hi_res_image = hi_res_image.convert("RGB")
|
||||
|
||||
page_tags = openai_image_request(
|
||||
page_tags = api_image_request(
|
||||
image=hi_res_image,
|
||||
prompt=self.prompt_content,
|
||||
url=self.vlm_options.url,
|
@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
|
||||
)
|
||||
from docling.exceptions import OperationNotAllowed
|
||||
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
||||
from docling.utils.utils import openai_image_request
|
||||
from docling.utils.api_image_request import api_image_request
|
||||
|
||||
|
||||
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
||||
@ -48,7 +48,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
||||
# Note: technically we could make a batch request here,
|
||||
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
|
||||
for image in images:
|
||||
yield openai_image_request(
|
||||
yield api_image_request(
|
||||
image=image,
|
||||
prompt=self.options.prompt,
|
||||
url=self.options.url,
|
||||
|
@ -15,16 +15,16 @@ from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import (
|
||||
ApiVlmOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
InferenceFramework,
|
||||
OpenAiVlmOptions,
|
||||
ResponseFormat,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.api_vlm_model import ApiVlmModel
|
||||
from docling.models.hf_mlx_model import HuggingFaceMlxModel
|
||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||
from docling.models.openai_vlm_model import OpenAiVlmModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
@ -60,13 +60,11 @@ class VlmPipeline(PaginatedPipeline):
|
||||
|
||||
self.keep_images = self.pipeline_options.generate_page_images
|
||||
|
||||
if isinstance(pipeline_options.vlm_options, OpenAiVlmOptions):
|
||||
if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
|
||||
self.build_pipe = [
|
||||
OpenAiVlmModel(
|
||||
ApiVlmModel(
|
||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||
vlm_options=cast(
|
||||
OpenAiVlmOptions, self.pipeline_options.vlm_options
|
||||
),
|
||||
vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
|
||||
),
|
||||
]
|
||||
elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
|
||||
|
61
docling/utils/api_image_request.py
Normal file
61
docling/utils/api_image_request.py
Normal file
@ -0,0 +1,61 @@
|
||||
import base64
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from typing import Dict, Optional
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
from pydantic import AnyUrl
|
||||
|
||||
from docling.datamodel.base_models import OpenAiApiResponse
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def api_image_request(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    timeout: float = 20,
    headers: Optional[Dict[str, str]] = None,
    **params,
) -> str:
    """Send a single-image chat-completion request to an OpenAI-compatible API.

    The image is PNG-encoded in memory, embedded as a base64 data URL, and sent
    together with ``prompt`` as a single user message. Any extra keyword
    arguments (e.g. ``model``) are merged into the top level of the request
    payload.

    Args:
        image: Image to submit; re-encoded as PNG before upload.
        prompt: Text instruction accompanying the image.
        url: Full endpoint URL (e.g. ``.../v1/chat/completions``).
        timeout: HTTP request timeout in seconds.
        headers: Optional extra HTTP headers (e.g. authorization).
        **params: Additional top-level payload fields forwarded to the API.

    Returns:
        The stripped text content of the first response choice.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    img_io = BytesIO()
    image.save(img_io, "PNG")
    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
                {
                    "type": "text",
                    "text": prompt,
                },
            ],
        }
    ]

    payload = {
        "messages": messages,
        **params,
    }

    headers = headers or {}

    r = requests.post(
        str(url),
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    if not r.ok:
        # Lazy %-args: the (possibly large) response body is only rendered
        # when the error record is actually emitted.
        _log.error("Error calling the API. Response was %s", r.text)
        r.raise_for_status()

    api_resp = OpenAiApiResponse.model_validate_json(r.text)
    generated_text = api_resp.choices[0].message.content.strip()
    return generated_text
|
@ -1,20 +1,12 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
from typing import List, Union
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
from pydantic import AnyUrl
|
||||
from tqdm import tqdm
|
||||
|
||||
from docling.datamodel.base_models import OpenAiApiResponse
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def chunkify(iterator, chunk_size):
|
||||
"""Yield successive chunks of chunk_size from the iterable."""
|
||||
@ -71,52 +63,3 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
|
||||
|
||||
buf.seek(0)
|
||||
return buf
|
||||
|
||||
|
||||
def openai_image_request(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    timeout: float = 20,
    headers: Optional[Dict[str, str]] = None,
    **params,
) -> str:
    """Post one image plus a text prompt to an OpenAI-style chat endpoint.

    The image is serialized to PNG, base64-encoded into a data URL, and
    bundled with the prompt into a single user message. Extra keyword
    arguments become additional top-level fields of the JSON payload.

    Returns the stripped text of the first choice in the response; raises
    ``requests.HTTPError`` on a non-2xx status.
    """
    # Encode the image as an inline PNG data URL.
    buffer = BytesIO()
    image.save(buffer, "PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")

    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded}"},
    }
    text_part = {
        "type": "text",
        "text": prompt,
    }
    user_message = {"role": "user", "content": [image_part, text_part]}

    payload = {"messages": [user_message], **params}

    response = requests.post(
        str(url),
        headers=headers or {},
        json=payload,
        timeout=timeout,
    )
    if not response.ok:
        _log.error(f"Error calling the API. Response was {response.text}")
        response.raise_for_status()

    parsed = OpenAiApiResponse.model_validate_json(response.text)
    return parsed.choices[0].message.content.strip()
|
||||
|
Loading…
Reference in New Issue
Block a user