docs: introduce docs site (#141)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-12-08 20:58:11 +00:00 · 2024-10-14 14:13:13 +02:00
parent 2b1e72d327
commit d504432c1e
25 changed files with 1324 additions and 574 deletions
--- a/docs/examples/rag_langchain.ipynb
+++ b/docs/examples/rag_langchain.ipynb
@@ -0,0 +1,369 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RAG with LangChain 🦜🔗"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# requirements for this example:\n",
+    "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
+    "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Loader and splitter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Below we set up:\n",
+    "- a loader (`DoclingPDFLoader`), which will be used to create LangChain documents, and\n",
+    "- a splitter, which will be used to split these documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from enum import Enum\n",
+    "from typing import Iterator\n",
+    "\n",
+    "from langchain_core.document_loaders import BaseLoader\n",
+    "from langchain_core.documents import Document as LCDocument\n",
+    "from pydantic import BaseModel\n",
+    "\n",
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "\n",
+    "class DocumentMetadata(BaseModel):\n",
+    "    dl_doc_hash: str\n",
+    "    # source: str\n",
+    "\n",
+    "\n",
+    "class DoclingPDFLoader(BaseLoader):\n",
+    "    class ParseType(str, Enum):\n",
+    "        MARKDOWN = \"markdown\"\n",
+    "        # JSON = \"json\"\n",
+    "\n",
+    "    def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n",
+    "        self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
+    "        self._parse_type = parse_type\n",
+    "        self._converter = DocumentConverter()\n",
+    "\n",
+    "    def lazy_load(self) -> Iterator[LCDocument]:\n",
+    "        for source in self._file_paths:\n",
+    "            dl_doc = self._converter.convert_single(source).output\n",
+    "            match self._parse_type:\n",
+    "                case self.ParseType.MARKDOWN:\n",
+    "                    text = dl_doc.export_to_markdown()\n",
+    "                # case self.ParseType.JSON:\n",
+    "                #     text = dl_doc.model_dump_json()\n",
+    "                case _:\n",
+    "                    raise RuntimeError(\n",
+    "                        f\"Unexpected parse type encountered: {self._parse_type}\"\n",
+    "                    )\n",
+    "            lc_doc = LCDocument(\n",
+    "                page_content=text,\n",
+    "                metadata=DocumentMetadata(\n",
+    "                    dl_doc_hash=dl_doc.file_info.document_hash,\n",
+    "                ).model_dump(),\n",
+    "            )\n",
+    "            yield lc_doc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\"  # DocLayNet paper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1b38d07d5fed4618a44ecf261e1e5c44",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
+    "\n",
+    "loader = DoclingPDFLoader(\n",
+    "    file_path=FILE_PATH,\n",
+    "    parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
+    ")\n",
+    "text_splitter = RecursiveCharacterTextSplitter(\n",
+    "    chunk_size=1000,\n",
+    "    chunk_overlap=200,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We now use the above-defined objects to get the document splits:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()\n",
+    "splits = text_splitter.split_documents(docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
+    "\n",
+    "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
+    "embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Vector store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tempfile import TemporaryDirectory\n",
+    "\n",
+    "from langchain_milvus import Milvus\n",
+    "\n",
+    "MILVUS_URI = os.environ.get(\n",
+    "    \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
+    ")\n",
+    "\n",
+    "vectorstore = Milvus.from_documents(\n",
+    "    splits,\n",
+    "    embeddings,\n",
+    "    connection_args={\"uri\": MILVUS_URI},\n",
+    "    drop_old=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### LLM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
+      "Token is valid (permission: write).\n",
+      "Your token has been saved to /Users/pva/.cache/huggingface/token\n",
+      "Login successful\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_huggingface import HuggingFaceEndpoint\n",
+    "\n",
+    "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
+    "HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",
+    "\n",
+    "llm = HuggingFaceEndpoint(\n",
+    "    repo_id=HF_LLM_MODEL_ID,\n",
+    "    huggingfacehub_api_token=HF_API_KEY,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## RAG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Iterable\n",
+    "\n",
+    "from langchain_core.documents import Document as LCDocument\n",
+    "from langchain_core.output_parsers import StrOutputParser\n",
+    "from langchain_core.prompts import PromptTemplate\n",
+    "from langchain_core.runnables import RunnablePassthrough\n",
+    "\n",
+    "\n",
+    "def format_docs(docs: Iterable[LCDocument]):\n",
+    "    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
+    "\n",
+    "\n",
+    "retriever = vectorstore.as_retriever()\n",
+    "\n",
+    "prompt = PromptTemplate.from_template(\n",
+    "    \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",
+    ")\n",
+    "\n",
+    "rag_chain = (\n",
+    "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
+    "    | prompt\n",
+    "    | llm\n",
+    "    | StrOutputParser()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'The human annotation of DocLayNet was performed on 80863 pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}