mirror of https://github.com/DS4SD/docling.git (synced 2025-12-08 20:58:11 +00:00)
docs: Add example with MongoDB
docs/examples/rag_mongodb.ipynb | 70
@@ -68,7 +68,7 @@
 "import warnings\n",
 "\n",
 "warnings.filterwarnings(\"ignore\")\n",
-"logging.getLogger(\"pymongo\").setLevel(logging.ERROR)\n"
+"logging.getLogger(\"pymongo\").setLevel(logging.ERROR)"
 ]
 },
 {
@@ -185,8 +185,10 @@
 }
 ],
 "source": [
-"from docling.document_converter import DocumentConverter\n",
+"from pprint import pprint\n",
 "\n",
+"from docling.document_converter import DocumentConverter\n",
+"\n",
 "# Instantiate the doc converter\n",
 "doc_converter = DocumentConverter()\n",
 "\n",
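The hunk above only reorders the imports around the DocumentConverter setup; the conversion and chunking that produce the chunk_texts list consumed by the embedding cell below sit outside the diff context. A minimal sketch of that step, assuming docling's HybridChunker and the "Attention Is All You Need" paper as the source; the notebook's actual chunking code may differ:

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

# Convert a source document (URL or local path) into a DoclingDocument
doc_converter = DocumentConverter()
conv_result = doc_converter.convert("https://arxiv.org/pdf/1706.03762")  # assumed source paper
doc = conv_result.document

# Chunk the document; chunk_texts is what the VoyageAI embedding cell consumes
chunker = HybridChunker()
chunk_texts = [chunk.text for chunk in chunker.chunk(dl_doc=doc)]
print(f"Prepared {len(chunk_texts)} chunks")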
@@ -288,11 +290,11 @@
 },
 "outputs": [],
 "source": [
+"import voyageai\n",
+"\n",
 "# Voyage API key\n",
 "VOYAGE_API_KEY = \"**********************\"\n",
 "\n",
-"import voyageai\n",
-"\n",
 "# Initialize the VoyageAI client\n",
 "vo = voyageai.Client(VOYAGE_API_KEY)\n",
 "result = vo.contextualized_embed(inputs=[chunk_texts], model=\"voyage-context-3\")\n",
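The cell above passes all chunks as a single input document to vo.contextualized_embed. A short sketch of how the per-chunk embeddings are then pulled out of the result, assuming the variable name contextualized_chunk_embds used by the next cell; the attribute path .results[0].embeddings mirrors the query-embedding cell later in the notebook:

# One input document was passed (inputs=[chunk_texts]), so results[0]
# carries one contextualized embedding per chunk.
contextualized_chunk_embds = result.results[0].embeddings

# voyage-context-3 vectors are 1024-dimensional by default, matching the
# numDimensions used when the Atlas index is created below.
print(len(contextualized_chunk_embds), len(contextualized_chunk_embds[0]))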
@@ -326,7 +328,10 @@
 "outputs": [],
 "source": [
 "# Combine chunks with their embeddings\n",
-"chunk_data = [{\"text\": text, \"embedding\": emb} for text, emb in zip(chunk_texts, contextualized_chunk_embds)]"
+"chunk_data = [\n",
+" {\"text\": text, \"embedding\": emb}\n",
+" for text, emb in zip(chunk_texts, contextualized_chunk_embds)\n",
+"]"
 ]
 },
 {
@@ -362,7 +367,9 @@
 "# Insert to MongoDB\n",
 "from pymongo import MongoClient\n",
 "\n",
-"client = MongoClient(\"mongodb+srv://*******.mongodb.net/\") # Replace with your MongoDB connection string\n",
+"client = MongoClient(\n",
+" \"mongodb+srv://*******.mongodb.net/\"\n",
+") # Replace with your MongoDB connection string\n",
 "db = client[\"rag_db\"] # Database name\n",
 "collection = db[\"documents\"] # Collection name\n",
 "\n",
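The hunk above only rewraps the MongoClient call; the actual insertion of chunk_data falls outside the diff context. A hedged sketch of what that step presumably looks like with pymongo:

# Write the chunk/embedding documents prepared above into the collection.
# delete_many is an optional reset so reruns don't duplicate documents.
collection.delete_many({})
insert_result = collection.insert_many(chunk_data)
print(f"Inserted {len(insert_result.inserted_ids)} documents")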
@@ -405,15 +412,15 @@
 " \"type\": \"vector\",\n",
 " \"path\": \"embedding\",\n",
 " \"numDimensions\": 1024,\n",
-" \"similarity\": \"dotProduct\"\n",
+" \"similarity\": \"dotProduct\",\n",
 " }\n",
 " ]\n",
 " },\n",
 " name=\"vector_index\",\n",
-" type=\"vectorSearch\"\n",
+" type=\"vectorSearch\",\n",
 ")\n",
 "result = collection.create_search_index(model=search_index_model)\n",
-"print(\"New search index named \" + result + \" is building.\")\n"
+"print(\"New search index named \" + result + \" is building.\")"
 ]
 },
 {
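Only the tail of the index definition appears in the hunk above. For readability, a sketch of the complete cell it implies, using pymongo's SearchIndexModel; the notebook's exact surrounding code may differ slightly:

from pymongo.operations import SearchIndexModel

# Atlas Vector Search index over the "embedding" field: 1024 dimensions
# (voyage-context-3) with dot-product similarity, as in the fragment above.
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 1024,
                "similarity": "dotProduct",
            }
        ]
    },
    name="vector_index",
    type="vectorSearch",
)
result = collection.create_search_index(model=search_index_model)
print("New search index named " + result + " is building.")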
@@ -524,21 +531,24 @@
 }
 ],
 "source": [
+"import os\n",
+"\n",
+"from openai import AzureOpenAI\n",
 "from rich.console import Console\n",
 "from rich.panel import Panel\n",
-"from openai import AzureOpenAI\n",
-"import os\n",
 "\n",
 "# Create MongoDB vector search query for \"Attention is All You Need\"\n",
 "# (prompt already defined above, reuse if present; else keep this definition)\n",
 "prompt = \"Give me top 3 learning points from `Attention is All You Need`, using only the retrieved context.\"\n",
 "\n",
 "# Generate embedding for the query using VoyageAI (vo already initialized earlier)\n",
-"query_embd_context = vo.contextualized_embed(\n",
-" inputs=[[prompt]],\n",
-" model=\"voyage-context-3\",\n",
-" input_type=\"query\"\n",
-").results[0].embeddings[0]\n",
+"query_embd_context = (\n",
+" vo.contextualized_embed(\n",
+" inputs=[[prompt]], model=\"voyage-context-3\", input_type=\"query\"\n",
+" )\n",
+" .results[0]\n",
+" .embeddings[0]\n",
+")\n",
 "\n",
 "# Vector search pipeline\n",
 "search_pipeline = [\n",
@@ -548,21 +558,17 @@
 " \"path\": \"embedding\",\n",
 " \"queryVector\": query_embd_context,\n",
 " \"numCandidates\": 10,\n",
-" \"limit\": 10\n",
+" \"limit\": 10,\n",
 " }\n",
 " },\n",
-" {\n",
-" \"$project\": {\n",
-" \"text\": 1,\n",
-" \"_id\": 0,\n",
-" \"score\": {\"$meta\": \"vectorSearchScore\"}\n",
-" }\n",
-" }\n",
+" {\"$project\": {\"text\": 1, \"_id\": 0, \"score\": {\"$meta\": \"vectorSearchScore\"}}},\n",
 "]\n",
 "\n",
 "results = list(collection.aggregate(search_pipeline))\n",
 "if not results:\n",
-" raise ValueError(\"No vector search results returned. Verify the index is built before querying.\")\n",
+" raise ValueError(\n",
+" \"No vector search results returned. Verify the index is built before querying.\"\n",
+" )\n",
 "\n",
 "context_texts = [doc[\"text\"] for doc in results]\n",
 "combined_context = \"\\n\\n\".join(context_texts)\n",
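The opening of the $vectorSearch stage, including the index name, falls outside the hunk above. A sketch of the full pipeline as the visible lines imply it, assuming the vector_index created earlier:

search_pipeline = [
    {
        "$vectorSearch": {
            "index": "vector_index",  # assumed: the index created above
            "path": "embedding",
            "queryVector": query_embd_context,
            "numCandidates": 10,
            "limit": 10,
        }
    },
    {"$project": {"text": 1, "_id": 0, "score": {"$meta": "vectorSearchScore"}}},
]
results = list(collection.aggregate(search_pipeline))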
@@ -579,7 +585,7 @@
 "client = AzureOpenAI(\n",
 " api_key=AZURE_OPENAI_API_KEY,\n",
 " azure_endpoint=AZURE_OPENAI_ENDPOINT.rstrip(\"/\"),\n",
-" api_version=AZURE_OPENAI_API_VERSION\n",
+" api_version=AZURE_OPENAI_API_VERSION,\n",
 ")\n",
 "\n",
 "# Chat completion using retrieved context\n",
@@ -588,21 +594,23 @@
 " messages=[\n",
 " {\n",
 " \"role\": \"system\",\n",
-" \"content\": \"You are a helpful assistant. Use only the provided context to answer questions. If the context is insufficient, say so.\"\n",
+" \"content\": \"You are a helpful assistant. Use only the provided context to answer questions. If the context is insufficient, say so.\",\n",
 " },\n",
 " {\n",
 " \"role\": \"user\",\n",
-" \"content\": f\"Context:\\n{combined_context}\\n\\nQuestion: {prompt}\"\n",
-" }\n",
+" \"content\": f\"Context:\\n{combined_context}\\n\\nQuestion: {prompt}\",\n",
+" },\n",
 " ],\n",
-" temperature=0.2\n",
+" temperature=0.2,\n",
 ")\n",
 "\n",
 "response_text = response.choices[0].message.content\n",
 "\n",
 "console = Console()\n",
 "console.print(Panel(f\"{prompt}\", title=\"Prompt\", border_style=\"bold red\"))\n",
-"console.print(Panel(response_text, title=\"Generated Content\", border_style=\"bold green\"))"
+"console.print(\n",
+" Panel(response_text, title=\"Generated Content\", border_style=\"bold green\")\n",
+")"
 ]
 },
 {
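The chat-completion hunks above reference AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_VERSION without showing where they come from. A hedged sketch of the configuration the `import os` suggests; the deployment variable (the model= argument of the chat call, which is not visible in the hunk) is hypothetical:

import os

# Assumed configuration; the notebook may hard-code or prompt for these instead.
AZURE_OPENAI_API_KEY = os.environ["AZURE_OPENAI_API_KEY"]
AZURE_OPENAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-06-01")
AZURE_OPENAI_DEPLOYMENT = os.environ["AZURE_OPENAI_DEPLOYMENT"]  # hypothetical name for model=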
@@ -123,6 +123,7 @@ nav:
 - examples/rag_opensearch.ipynb
 - examples/rag_weaviate.ipynb
 - examples/retrieval_qdrant.ipynb
+- examples/rag_mongodb.ipynb
 - Integrations:
 - Integrations: integrations/index.md
 - 🤖 Agentic / AI dev frameworks: