{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RAG with Docling and 🦜🔗 LangChain" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# requirements for this example:\n", "%pip install -qq docling docling-core python-dotenv langchain langchain-text-splitters langchain-huggingface langchain-milvus" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n", "# https://github.com/huggingface/transformers/issues/5486:\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loader and splitter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we set up:\n", "- a `Loader` which will be used to create LangChain documents,\n", "- a splitter, which will be used to split these documents, and\n", "- a helper function for QA printing" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from enum import Enum\n", "from typing import Iterator\n", "\n", "from langchain_core.document_loaders import BaseLoader\n", "from langchain_core.documents import Document as LCDocument\n", "\n", "from docling.document_converter import DocumentConverter\n", "\n", "_KEY_DL_DOC_HASH = \"dl_doc_hash\"\n", "_KEY_ORIGIN = \"origin\"\n", "\n", "\n", "class DoclingPDFLoader(BaseLoader):\n", " class ParseType(str, Enum):\n", " MARKDOWN = \"markdown\"\n", " JSON = \"json\"\n", "\n", " include_origin: bool = False\n", "\n", " def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n", " self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n", " self._parse_type = parse_type\n", " self._converter = DocumentConverter()\n", "\n", " def lazy_load(self) -> Iterator[LCDocument]:\n", " for source in self._file_paths:\n", " dl_doc = self._converter.convert_single(source).output\n", " match self._parse_type:\n", " case self.ParseType.MARKDOWN:\n", " text = dl_doc.export_to_markdown()\n", " case self.ParseType.JSON:\n", " text = dl_doc.model_dump_json()\n", " case _:\n", " raise RuntimeError(\n", " f\"Unexpected parse type encountered: {self._parse_type}\"\n", " )\n", " metadata = {\n", " _KEY_DL_DOC_HASH: dl_doc.file_info.document_hash,\n", " }\n", " if self.include_origin:\n", " metadata[_KEY_ORIGIN] = source\n", "\n", " lc_doc = LCDocument(\n", " page_content=text,\n", " metadata=metadata,\n", " )\n", " yield lc_doc" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import json\n", "from typing import Iterable, List\n", "\n", "from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker\n", "from docling_core.types import Document as DLDocument\n", "from langchain_core.documents import Document as LCDocument\n", "\n", 
"\n", "class DoclingSplitter:\n", "\n", " def __init__(\n", " self,\n", " chunker: BaseChunker | None = None,\n", " ) -> None:\n", " self.chunker: BaseChunker = chunker or HierarchicalChunker(\n", " heading_as_metadata=True\n", " )\n", "\n", " def split_documents(self, documents: Iterable[LCDocument]) -> List[LCDocument]:\n", "\n", " all_chunk_docs: list[LCDocument] = []\n", " for doc in documents:\n", " lc_doc: LCDocument = LCDocument.parse_obj(doc)\n", " dl_doc: DLDocument = DLDocument.model_validate_json(lc_doc.page_content)\n", " chunk_iter = self.chunker.chunk(dl_doc=dl_doc)\n", " for chunk in chunk_iter:\n", " chunk_metadata = chunk.model_dump(\n", " exclude=\"text\",\n", " exclude_none=True,\n", " )\n", " metadata = {**lc_doc.metadata, **chunk_metadata}\n", " for k, v in metadata.items():\n", " if isinstance(v, Iterable) and not isinstance(v, str):\n", " metadata[k] = json.dumps(v)\n", " chunk_doc = LCDocument(\n", " page_content=chunk.text,\n", " metadata=metadata,\n", " )\n", " all_chunk_docs.append(chunk_doc)\n", "\n", " return all_chunk_docs" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def print_qa(resp_dict):\n", " def clip(inp, max_len=100):\n", " if isinstance(inp, str):\n", " return f\"{inp[:max_len]}{'...' if len(inp) > max_len else ''}\"\n", " else:\n", " return inp\n", "\n", " print(\n", " f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{json.dumps(clip(resp_dict['answer']))}\"\n", " )\n", " for i, doc in enumerate(resp_dict[\"context\"]):\n", " print()\n", " print(f\"Source {i+1}:\")\n", " print(f\" text: {json.dumps(clip(doc.page_content))}\")\n", " for key in doc.metadata:\n", " if key != \"pk\":\n", " print(f\" {key}: {clip(doc.metadata.get(key))}\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Using native Docling format (as JSON)**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To leverage Docling's rich document structure format, we can namely export to JSON and use the `DoclingSplitter` accordingly:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "loader = DoclingPDFLoader(\n", " file_path=FILE_PATH,\n", " parse_type=DoclingPDFLoader.ParseType.JSON,\n", ")\n", "splitter = DoclingSplitter()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Using Markdown:**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "\n", "# loader = DoclingPDFLoader(\n", "# file_path=FILE_PATH,\n", "# parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n", "# )\n", "# splitter = RecursiveCharacterTextSplitter(\n", "# chunk_size=1000,\n", "# chunk_overlap=200,\n", "# )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now used the above-defined objects to get the document splits:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "docs = loader.load()\n", "splits = splitter.split_documents(docs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Embed model" ] }, { "cell_type": "code", "execution_count": 11, 
"metadata": {}, "outputs": [], "source": [ "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n", "\n", "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n", "embedding = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vector store" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from tempfile import TemporaryDirectory\n", "\n", "from langchain_milvus import Milvus\n", "\n", "MILVUS_URI = os.environ.get(\n", " \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n", ")\n", "\n", "vectorstore = Milvus.from_documents(\n", " splits,\n", " embedding,\n", " connection_args={\"uri\": MILVUS_URI},\n", " collection_name=\"docling_lc_demo\",\n", " drop_old=True,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LLM" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n", "Token is valid (permission: write).\n", "Your token has been saved to /Users/pva/.cache/huggingface/token\n", "Login successful\n" ] } ], "source": [ "from langchain_huggingface import HuggingFaceEndpoint\n", "\n", "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n", "HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n", "\n", "llm = HuggingFaceEndpoint(\n", " repo_id=HF_LLM_MODEL_ID,\n", " huggingfacehub_api_token=HF_API_KEY,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## RAG" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from langchain.chains import create_retrieval_chain\n", "from langchain.chains.combine_documents import create_stuff_documents_chain\n", "from langchain_core.prompts import PromptTemplate\n", "\n", "retriever = vectorstore.as_retriever()\n", "prompt = PromptTemplate.from_template(\n", " \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {input}\\nAnswer:\\n\"\n", ")\n", "question_answer_chain = create_stuff_documents_chain(llm, prompt)\n", "rag_chain = create_retrieval_chain(retriever, question_answer_chain)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Question:\n", "How many pages were human annotated for DocLayNet?\n", "\n", "Answer:\n", "\"80863 pages were human annotated for DocLayNet.\\nExplanation:\\nFrom the context, it is clear that DocL...\"\n", "\n", "Source 1:\n", " text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n", " bbox: [317.2852478027344, 116.46983337402344, 559.7131958007812, 201.73675537109375]\n", " dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n", " heading: 3 THE DOCLAYNET DATASET\n", " page: 2\n", " path: $.main-text[37]\n", "\n", "Source 2:\n", " text: \"In this paper, we present the DocLayNet dataset. 
It provides pageby-page layout annotation ground-tr...\"\n", " bbox: [53.50020980834961, 212.36782836914062, 295.56396484375, 286.4964599609375]\n", " dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n", " heading: 1 INTRODUCTION\n", " page: 2\n", " path: $.main-text[23]\n", "\n", "Source 3:\n", " text: \"Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11...\"\n", " bbox: [317.3695373535156, 82.78482818603516, 559.7149047851562, 244.83221435546875]\n", " dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n", " heading: 4 ANNOTATION CAMPAIGN\n", " page: 5\n", " path: $.main-text[80]\n", "\n", "Source 4:\n", " text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n", " bbox: [53.60108947753906, 723.3781127929688, 347.139892578125, 731.6909790039062]\n", " dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n", " heading: REFERENCES\n", " page: 9\n", " path: $.main-text[133]\n" ] } ], "source": [ "resp_dict = rag_chain.invoke(\n", " {\"input\": \"How many pages were human annotated for DocLayNet?\"}\n", ")\n", "print_qa(resp_dict=resp_dict)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }