Mirror of https://github.com/DS4SD/docling.git, synced 2025-07-27 04:24:45 +00:00

rename and refactor

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

This commit is contained in:
parent 0f438b3a76
commit f77c8cf96c
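
For downstream users the change is mechanical: the OpenAI-flavoured names become generic API names, and the image-request helper moves into its own module. A minimal before/after import sketch (illustrative only, not part of the diff below):

    # before this commit
    from docling.datamodel.pipeline_options import OpenAiVlmOptions
    from docling.utils.utils import openai_image_request

    # after this commit
    from docling.datamodel.pipeline_options import ApiVlmOptions
    from docling.utils.api_image_request import api_image_request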
docling/datamodel/pipeline_options.py

@@ -285,10 +285,12 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")


-class OpenAiVlmOptions(BaseVlmOptions):
-    kind: Literal["openai_model_options"] = "openai_model_options"
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"

-    url: AnyUrl = AnyUrl("http://localhost:11434/v1/chat/completions")  # Default to ollama
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
     scale: float = 2.0
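
A construction sketch for the renamed options class, using the fields visible in this hunk plus the prompt field inherited from BaseVlmOptions; the auth header is a placeholder and response_format is an assumption (it lies outside the lines shown here):

    from pydantic import AnyUrl

    from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat

    ollama_vlm_options = ApiVlmOptions(
        url=AnyUrl("http://localhost:11434/v1/chat/completions"),  # same Ollama default as above
        params={"model": "granite3.2-vision:2b"},  # merged into the request payload
        headers={"Authorization": "Bearer <token>"},  # placeholder; only for endpoints that need auth
        prompt="OCR the full page to markdown.",
        scale=2.0,
        response_format=ResponseFormat.MARKDOWN,  # assumption: declared outside this hunk
    )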
@@ -319,7 +321,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     inference_framework=InferenceFramework.TRANSFORMERS,
 )

-granite_vision_vlm_ollama_conversion_options = OpenAiVlmOptions(
+granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
     prompt="OCR the full page to markdown.",
@@ -384,7 +386,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions, OpenAiVlmOptions] = (
+    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
         smoldocling_vlm_conversion_options
     )

docling/models/api_vlm_model.py (renamed from docling/models/openai_vlm_model.py)

@@ -2,18 +2,18 @@ from typing import Iterable

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import OpenAiVlmOptions
+from docling.datamodel.pipeline_options import ApiVlmOptions
 from docling.models.base_model import BasePageModel
+from docling.utils.api_image_request import api_image_request
 from docling.utils.profiling import TimeRecorder
-from docling.utils.utils import openai_image_request


-class OpenAiVlmModel(BasePageModel):
+class ApiVlmModel(BasePageModel):

     def __init__(
         self,
         enabled: bool,
-        vlm_options: OpenAiVlmOptions,
+        vlm_options: ApiVlmOptions,
     ):
         self.enabled = enabled
         self.vlm_options = vlm_options

@@ -44,7 +44,7 @@ class OpenAiVlmModel(BasePageModel):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")

-                    page_tags = openai_image_request(
+                    page_tags = api_image_request(
                         image=hi_res_image,
                         prompt=self.prompt_content,
                         url=self.vlm_options.url,
docling/models/picture_description_api_model.py

@@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.exceptions import OperationNotAllowed
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
-from docling.utils.utils import openai_image_request
+from docling.utils.api_image_request import api_image_request


 class PictureDescriptionApiModel(PictureDescriptionBaseModel):

@@ -48,7 +48,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
         for image in images:
-            yield openai_image_request(
+            yield api_image_request(
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
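
The same helper now also serves the picture-description enrichment. A hedged configuration sketch, assuming the PictureDescriptionApiOptions class and the do_picture_description / enable_remote_services flags that docling exposes on its PDF pipeline options (none of these names appear in this diff):

    from pydantic import AnyUrl

    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        PictureDescriptionApiOptions,
    )

    pdf_options = PdfPipelineOptions()
    pdf_options.do_picture_description = True
    pdf_options.enable_remote_services = True  # API-backed models raise OperationNotAllowed otherwise
    pdf_options.picture_description_options = PictureDescriptionApiOptions(
        url=AnyUrl("http://localhost:11434/v1/chat/completions"),  # placeholder endpoint
        params={"model": "granite3.2-vision:2b"},  # placeholder model
        prompt="Describe this image in a few sentences.",
    )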
docling/pipeline/vlm_pipeline.py

@@ -15,16 +15,16 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
     HuggingFaceVlmOptions,
     InferenceFramework,
-    OpenAiVlmOptions,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.api_vlm_model import ApiVlmModel
 from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
-from docling.models.openai_vlm_model import OpenAiVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder


@@ -60,13 +60,11 @@ class VlmPipeline(PaginatedPipeline):

         self.keep_images = self.pipeline_options.generate_page_images

-        if isinstance(pipeline_options.vlm_options, OpenAiVlmOptions):
+        if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
             self.build_pipe = [
-                OpenAiVlmModel(
+                ApiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-                    vlm_options=cast(
-                        OpenAiVlmOptions, self.pipeline_options.vlm_options
-                    ),
+                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                 ),
             ]
         elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
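
End to end, the dispatch above routes pages through ApiVlmModel whenever ApiVlmOptions is configured. A usage sketch following docling's converter pattern; DocumentConverter, PdfFormatOption, and the enable_remote_services flag are assumed from the project's examples rather than shown in this diff:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        VlmPipelineOptions,
        granite_vision_vlm_ollama_conversion_options,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    pipeline_options = VlmPipelineOptions(
        vlm_options=granite_vision_vlm_ollama_conversion_options  # ApiVlmOptions preset targeting local Ollama
    )
    pipeline_options.enable_remote_services = True  # assumption: may be required for remote endpoints

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )

    result = converter.convert("document.pdf")  # placeholder input path
    print(result.document.export_to_markdown())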
docling/utils/api_image_request.py (new file, 61 lines)

@@ -0,0 +1,61 @@
+import base64
+import logging
+from io import BytesIO
+from typing import Dict, Optional
+
+import requests
+from PIL import Image
+from pydantic import AnyUrl
+
+from docling.datamodel.base_models import OpenAiApiResponse
+
+_log = logging.getLogger(__name__)
+
+
+def api_image_request(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    **params,
+) -> str:
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                },
+                {
+                    "type": "text",
+                    "text": prompt,
+                },
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        **params,
+    }
+
+    headers = headers or {}
+
+    r = requests.post(
+        str(url),
+        headers=headers,
+        json=payload,
+        timeout=timeout,
+    )
+    if not r.ok:
+        _log.error(f"Error calling the API. Response was {r.text}")
+        r.raise_for_status()
+
+    api_resp = OpenAiApiResponse.model_validate_json(r.text)
+    generated_text = api_resp.choices[0].message.content.strip()
+    return generated_text
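
A direct usage sketch of the new helper; the endpoint and model name mirror the Ollama defaults used elsewhere in this commit, and the image path is a placeholder:

    from PIL import Image
    from pydantic import AnyUrl

    from docling.utils.api_image_request import api_image_request

    page_image = Image.open("page.png")  # placeholder image path

    text = api_image_request(
        image=page_image,
        prompt="OCR the full page to markdown.",
        url=AnyUrl("http://localhost:11434/v1/chat/completions"),
        timeout=60,
        headers={},  # add auth headers here if the endpoint requires them
        model="granite3.2-vision:2b",  # extra kwargs land in the request payload via **params
    )
    print(text)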
docling/utils/utils.py

@@ -1,20 +1,12 @@
-import base64
 import hashlib
-import logging
 from io import BytesIO
 from itertools import islice
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import List, Union

 import requests
-from PIL import Image
-from pydantic import AnyUrl
 from tqdm import tqdm

-from docling.datamodel.base_models import OpenAiApiResponse
-
-_log = logging.getLogger(__name__)
-

 def chunkify(iterator, chunk_size):
     """Yield successive chunks of chunk_size from the iterable."""

@@ -71,52 +63,3 @@ def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:

     buf.seek(0)
     return buf
-
-
-def openai_image_request(
-    image: Image.Image,
-    prompt: str,
-    url: AnyUrl,
-    timeout: float = 20,
-    headers: Optional[Dict[str, str]] = None,
-    **params,
-) -> str:
-    img_io = BytesIO()
-    image.save(img_io, "PNG")
-    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                },
-                {
-                    "type": "text",
-                    "text": prompt,
-                },
-            ],
-        }
-    ]
-
-    payload = {
-        "messages": messages,
-        **params,
-    }
-
-    headers = headers or {}
-
-    r = requests.post(
-        str(url),
-        headers=headers,
-        json=payload,
-        timeout=timeout,
-    )
-    if not r.ok:
-        _log.error(f"Error calling the API. Response was {r.text}")
-        r.raise_for_status()
-
-    api_resp = OpenAiApiResponse.model_validate_json(r.text)
-    generated_text = api_resp.choices[0].message.content.strip()
-    return generated_text