diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py
new file mode 100644
index 00000000..bc71af38
--- /dev/null
+++ b/docs/examples/vlm_pipeline_api_model.py
@@ -0,0 +1,111 @@
+import logging
+import os
+from pathlib import Path
+
+import requests
+from dotenv import load_dotenv
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
+    ResponseFormat,
+    VlmPipelineOptions,
+    granite_vision_vlm_ollama_conversion_options,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.pipeline.vlm_pipeline import VlmPipeline
+
+
+def ollama_vlm_options(model: str, prompt: str):
+    options = ApiVlmOptions(
+        url="http://localhost:11434/v1/chat/completions",  # the default Ollama endpoint
+        params=dict(
+            model=model,
+        ),
+        prompt=prompt,
+        timeout=90,
+        scale=1.0,
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
+def watsonx_vlm_options(model: str, prompt: str):
+    load_dotenv()
+    api_key = os.environ.get("WX_API_KEY")
+    project_id = os.environ.get("WX_PROJECT_ID")
+
+    def _get_iam_access_token(api_key: str) -> str:
+        res = requests.post(
+            url="https://iam.cloud.ibm.com/identity/token",
+            headers={
+                "Content-Type": "application/x-www-form-urlencoded",
+            },
+            data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}",
+        )
+        res.raise_for_status()
+        api_out = res.json()
+        print(f"{api_out=}")
+        return api_out["access_token"]
+
+    options = ApiVlmOptions(
+        url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
+        params=dict(
+            model_id=model,
+            project_id=project_id,
+            parameters=dict(
+                max_new_tokens=400,
+            ),
+        ),
+        headers={
+            "Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
+        },
+        prompt=prompt,
+        timeout=60,
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    # input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
+
+    pipeline_options = VlmPipelineOptions(
+        enable_remote_services=True  # <-- this is required!
+    )
+
+    # ApiVlmOptions() allows interfacing with APIs that support the
+    # multi-modal chat interface. Below are a few examples of how to configure it.
+
+    # One possibility is self-hosting a model, e.g. via Ollama.
+    # Example using the Granite Vision model (enabled by default in this script):
+    pipeline_options.vlm_options = ollama_vlm_options(
+        model="granite3.2-vision:2b",
+        prompt="OCR the full page to markdown.",
+    )
+
+    # Another possibility is using an online service, e.g. watsonx.ai.
+    # Using it requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
+    # Uncomment the following lines to use this option instead:
+    # pipeline_options.vlm_options = watsonx_vlm_options(
+    #     model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."
+    # )
+
+    # Create the DocumentConverter and launch the conversion.
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                pipeline_cls=VlmPipeline,
+            )
+        }
+    )
+    result = doc_converter.convert(input_doc_path)
+    print(result.document.export_to_markdown())
+
+
+if __name__ == "__main__":
+    main()
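
Outside the diff itself, one point worth noting is that ApiVlmOptions is not tied to the two services shown above: any endpoint that speaks the same multi-modal chat interface can be wired in with the same helper pattern. The sketch below illustrates this under that assumption; the function name, URL, port, and model name are placeholders (for a locally hosted OpenAI-compatible server, for example) and are not part of this PR.

from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat


def openai_compatible_vlm_options(model: str, prompt: str):
    # Hypothetical helper: point ApiVlmOptions at any OpenAI-compatible
    # /v1/chat/completions endpoint. URL and model name are placeholders.
    return ApiVlmOptions(
        url="http://localhost:8000/v1/chat/completions",  # placeholder endpoint
        params=dict(model=model),
        prompt=prompt,
        timeout=90,
        scale=1.0,
        response_format=ResponseFormat.MARKDOWN,
    )


# Usage mirrors the Ollama case in the example above:
# pipeline_options.vlm_options = openai_compatible_vlm_options(
#     model="my-vision-model", prompt="OCR the full page to markdown."
# )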