{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Docling and 🦜🔗 LangChain"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# requirements for this example:\n",
"%pip install -qq docling docling-core python-dotenv langchain langchain-text-splitters langchain-huggingface langchain-milvus"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n",
"# https://github.com/huggingface/transformers/issues/5486:\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Helpers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we set up:\n",
|
|
"- a `Loader` which will be used to create LangChain documents,\n",
|
|
"- a splitter, which will be used to split these documents, and\n",
|
|
"- a helper function for QA printing"
|
|
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from enum import Enum\n",
|
|
"from typing import Iterator\n",
|
|
"\n",
|
|
"from langchain_core.document_loaders import BaseLoader\n",
|
|
"from langchain_core.documents import Document as LCDocument\n",
|
|
"\n",
|
|
"from docling.document_converter import DocumentConverter\n",
|
|
"\n",
|
|
"_KEY_DL_DOC_HASH = \"dl_doc_hash\"\n",
|
|
"_KEY_ORIGIN = \"origin\"\n",
|
|
"\n",
|
|
"\n",
|
|
"class DoclingPDFLoader(BaseLoader):\n",
|
|
" class ParseType(str, Enum):\n",
|
|
" MARKDOWN = \"markdown\"\n",
|
|
" JSON = \"json\"\n",
|
|
"\n",
|
|
" include_origin: bool = False\n",
|
|
"\n",
|
|
" def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n",
|
|
" self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
|
|
" self._parse_type = parse_type\n",
|
|
" self._converter = DocumentConverter()\n",
|
|
"\n",
|
|
" def lazy_load(self) -> Iterator[LCDocument]:\n",
|
|
" for source in self._file_paths:\n",
|
|
" dl_doc = self._converter.convert_single(source).output\n",
|
|
" match self._parse_type:\n",
|
|
" case self.ParseType.MARKDOWN:\n",
|
|
" text = dl_doc.export_to_markdown()\n",
|
|
" case self.ParseType.JSON:\n",
|
|
" text = dl_doc.model_dump_json()\n",
|
|
" case _:\n",
|
|
" raise RuntimeError(\n",
|
|
" f\"Unexpected parse type encountered: {self._parse_type}\"\n",
|
|
" )\n",
|
|
" metadata = {\n",
|
|
" _KEY_DL_DOC_HASH: dl_doc.file_info.document_hash,\n",
|
|
" }\n",
|
|
" if self.include_origin:\n",
|
|
" metadata[_KEY_ORIGIN] = source\n",
|
|
"\n",
|
|
" lc_doc = LCDocument(\n",
|
|
" page_content=text,\n",
|
|
" metadata=metadata,\n",
|
|
" )\n",
|
|
" yield lc_doc"
|
|
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
|
|
"from typing import Iterable, List\n",
|
|
"\n",
|
|
"from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker\n",
|
|
"from docling_core.types import Document as DLDocument\n",
|
|
"from langchain_core.documents import Document as LCDocument\n",
|
|
"\n",
|
|
"\n",
|
|
"class DoclingSplitter:\n",
|
|
"\n",
|
|
" def __init__(\n",
|
|
" self,\n",
|
|
" chunker: BaseChunker | None = None,\n",
|
|
" ) -> None:\n",
|
|
" self.chunker: BaseChunker = chunker or HierarchicalChunker(\n",
|
|
" heading_as_metadata=True\n",
|
|
" )\n",
|
|
"\n",
|
|
" def split_documents(self, documents: Iterable[LCDocument]) -> List[LCDocument]:\n",
|
|
"\n",
|
|
" all_chunk_docs: list[LCDocument] = []\n",
|
|
" for doc in documents:\n",
|
|
" lc_doc: LCDocument = LCDocument.parse_obj(doc)\n",
|
|
" dl_doc: DLDocument = DLDocument.model_validate_json(lc_doc.page_content)\n",
|
|
" chunk_iter = self.chunker.chunk(dl_doc=dl_doc)\n",
|
|
" for chunk in chunk_iter:\n",
|
|
" chunk_metadata = chunk.model_dump(\n",
|
|
" exclude=\"text\",\n",
|
|
" exclude_none=True,\n",
|
|
" )\n",
|
|
" metadata = {**lc_doc.metadata, **chunk_metadata}\n",
|
|
" for k, v in metadata.items():\n",
|
|
" if isinstance(v, Iterable) and not isinstance(v, str):\n",
|
|
" metadata[k] = json.dumps(v)\n",
|
|
" chunk_doc = LCDocument(\n",
|
|
" page_content=chunk.text,\n",
|
|
" metadata=metadata,\n",
|
|
" )\n",
|
|
" all_chunk_docs.append(chunk_doc)\n",
|
|
"\n",
|
|
" return all_chunk_docs"
|
|
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def print_qa(resp_dict):\n",
|
|
" def clip(inp, max_len=100):\n",
|
|
" if isinstance(inp, str):\n",
|
|
" return f\"{inp[:max_len]}{'...' if len(inp) > max_len else ''}\"\n",
|
|
" else:\n",
|
|
" return inp\n",
|
|
"\n",
|
|
" print(\n",
|
|
" f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{json.dumps(clip(resp_dict['answer']))}\"\n",
|
|
" )\n",
|
|
" for i, doc in enumerate(resp_dict[\"context\"]):\n",
|
|
" print()\n",
|
|
" print(f\"Source {i+1}:\")\n",
|
|
" print(f\" text: {json.dumps(clip(doc.page_content))}\")\n",
|
|
" for key in doc.metadata:\n",
|
|
" if key != \"pk\":\n",
|
|
" print(f\" {key}: {clip(doc.metadata.get(key))}\")"
|
|
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Loader and splitter"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Using native Docling format (as JSON)**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"To leverage Docling's rich document structure format, we can namely export to JSON and use the `DoclingSplitter` accordingly:"
|
|
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"loader = DoclingPDFLoader(\n",
"    file_path=FILE_PATH,\n",
"    parse_type=DoclingPDFLoader.ParseType.JSON,\n",
")\n",
"splitter = DoclingSplitter()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Using Markdown:**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:"
|
|
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"\n",
"# loader = DoclingPDFLoader(\n",
"#     file_path=FILE_PATH,\n",
"#     parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
"# )\n",
"# splitter = RecursiveCharacterTextSplitter(\n",
"#     chunk_size=1000,\n",
"#     chunk_overlap=200,\n",
"# )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now used the above-defined objects to get the document splits:"
|
|
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()\n",
"splits = splitter.split_documents(docs)"
]
},
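{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, we can inspect the number of splits and a sample chunk. This cell is an illustrative addition to the original walkthrough; it only uses the standard LangChain `Document` attributes (`page_content`, `metadata`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative check: each split is a LangChain Document carrying text plus chunk metadata\n",
"print(f\"number of splits: {len(splits)}\")\n",
"print(f\"sample text: {splits[0].page_content[:100]}...\")\n",
"print(f\"sample metadata keys: {list(splits[0].metadata.keys())}\")"
]
},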
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embed model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
"\n",
"HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
"embedding = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
]
},
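{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, the embed model can be sanity-checked by embedding a test string and inspecting the resulting vector's dimensionality. This cell is an illustrative addition; `embed_query` is the standard LangChain embeddings method, and the query string is arbitrary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative check: embed a sample query and report the vector size\n",
"sample_vector = embedding.embed_query(\"What is DocLayNet?\")\n",
"print(f\"embedding dimensionality: {len(sample_vector)}\")"
]
},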
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Vector store"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from tempfile import TemporaryDirectory\n",
"\n",
"from langchain_milvus import Milvus\n",
"\n",
"MILVUS_URI = os.environ.get(\n",
"    \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
")\n",
"\n",
"vectorstore = Milvus.from_documents(\n",
"    splits,\n",
"    embedding,\n",
"    connection_args={\"uri\": MILVUS_URI},\n",
"    collection_name=\"docling_lc_demo\",\n",
"    drop_old=True,\n",
")"
]
},
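{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before wiring up the full RAG chain, retrieval can optionally be verified directly on the vector store. This cell is an illustrative addition; `similarity_search` is the standard LangChain vector store method, and the query string is arbitrary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative check: run a plain similarity search against the Milvus collection\n",
"hits = vectorstore.similarity_search(\"human annotations\", k=2)\n",
"for hit in hits:\n",
"    print(f\"- {hit.page_content[:80]}...\")"
]
},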
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### LLM"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
"Token is valid (permission: write).\n",
"Your token has been saved to /Users/pva/.cache/huggingface/token\n",
"Login successful\n"
]
}
],
"source": [
"from langchain_huggingface import HuggingFaceEndpoint\n",
"\n",
"HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
"HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",
"\n",
"llm = HuggingFaceEndpoint(\n",
"    repo_id=HF_LLM_MODEL_ID,\n",
"    huggingfacehub_api_token=HF_API_KEY,\n",
")"
]
},
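{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, the endpoint can be smoke-tested with a standalone prompt before building the chain. This cell is an illustrative addition; `invoke` is the standard LangChain LLM method, and the prompt is arbitrary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative check: call the LLM directly, outside any chain\n",
"print(llm.invoke(\"What is document layout analysis?\"))"
]
},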
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RAG"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from langchain.chains import create_retrieval_chain\n",
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"retriever = vectorstore.as_retriever()\n",
"prompt = PromptTemplate.from_template(\n",
"    \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {input}\\nAnswer:\\n\"\n",
")\n",
"question_answer_chain = create_stuff_documents_chain(llm, prompt)\n",
"rag_chain = create_retrieval_chain(retriever, question_answer_chain)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Question:\n",
"How many pages were human annotated for DocLayNet?\n",
"\n",
"Answer:\n",
"\"80863 pages were annotated by humans in DocLayNet.\"\n",
"\n",
"Source 1:\n",
"  text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n",
"  bbox: [317.2852478027344, 116.46983337402344, 559.7131958007812, 201.73675537109375]\n",
"  dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
"  heading: 3 THE DOCLAYNET DATASET\n",
"  page: 2\n",
"  path: $.main-text[37]\n",
"\n",
"Source 2:\n",
"  text: \"In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-tr...\"\n",
"  bbox: [53.50020980834961, 212.36782836914062, 295.56396484375, 286.4964599609375]\n",
"  dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
"  heading: 1 INTRODUCTION\n",
"  page: 2\n",
"  path: $.main-text[23]\n",
"\n",
"Source 3:\n",
"  text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
"  bbox: [53.60108947753906, 723.3781127929688, 347.139892578125, 731.6909790039062]\n",
"  dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
"  heading: REFERENCES\n",
"  page: 9\n",
"  path: $.main-text[133]\n",
"\n",
"Source 4:\n",
"  text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
"  bbox: [53.542964935302734, 723.3500366210938, 347.0172424316406, 731.6931762695312]\n",
"  dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
"  heading: 4 ANNOTATION CAMPAIGN\n",
"  page: 5\n",
"  path: $.main-text[64]\n"
]
}
],
"source": [
"resp_dict = rag_chain.invoke(\n",
"    {\"input\": \"How many pages were human annotated for DocLayNet?\"}\n",
")\n",
"print_qa(resp_dict=resp_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}