diff --git a/docs/examples/pictures_description.ipynb b/docs/examples/pictures_description.ipynb index 33e94e01..f50860db 100644 --- a/docs/examples/pictures_description.ipynb +++ b/docs/examples/pictures_description.ipynb @@ -175,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -265,6 +265,52 @@ "display.HTML(\"
\".join(html_buffer))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use other vision models\n", + "\n", + "The examples above can also be reproduced using other vision models.\n", + "The Docling option `PictureDescriptionVlmOptions` allows you to specify your favorite vision model from the Hugging Face Hub." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from docling.datamodel.pipeline_options import PictureDescriptionVlmOptions\n", + "\n", + "pipeline_options = PdfPipelineOptions()\n", + "pipeline_options.do_picture_description = True\n", + "pipeline_options.picture_description_options = PictureDescriptionVlmOptions(\n", + " repo_id=\"\", # <-- add here the Hugging Face repo_id of your favorite VLM\n", + " prompt=\"Describe the image in three sentences. Be concise and accurate.\",\n", + ")\n", + "pipeline_options.images_scale = 2.0\n", + "pipeline_options.generate_picture_images = True\n", + "\n", + "converter = DocumentConverter(\n", + " format_options={\n", + " InputFormat.PDF: PdfFormatOption(\n", + " pipeline_options=pipeline_options,\n", + " )\n", + " }\n", + ")\n", + "\n", + "# Uncomment to run:\n", + "# doc = converter.convert(DOC_SOURCE).document" + ] + }, + { + "cell_type": "code", + "execution_count": null,