{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RAG with Docling and 🦙 LlamaIndex" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# requirements for this example:\n", "%pip install -qq docling docling-core python-dotenv llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n", "# https://github.com/huggingface/transformers/issues/5486:\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Helpers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we define:\n", "\n", "- `DoclingPDFReader` which will be used to create LlamaIndex documents,\n", "- `DoclingNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents, and\n", "- a helper function for QA printing" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from enum import Enum\n", "from pathlib import Path\n", "from typing import Any, Iterable\n", "\n", "from llama_index.core.readers.base import BasePydanticReader\n", "from llama_index.core.schema import Document as LIDocument\n", "\n", "from docling.document_converter import DocumentConverter\n", "\n", "_KEY_DL_DOC_HASH = \"dl_doc_hash\"\n", "_KEY_ORIGIN = \"origin\"\n", "\n", "\n", "class DoclingPDFReader(BasePydanticReader):\n", " class ParseType(str, Enum):\n", " MARKDOWN = \"markdown\"\n", " JSON = \"json\"\n", "\n", " parse_type: ParseType = ParseType.MARKDOWN\n", " include_origin: bool = False\n", "\n", " def lazy_load_data(\n", " self,\n", " file_path: str | Path | Iterable[str] | Iterable[Path],\n", " *args: Any,\n", " **load_kwargs: Any,\n", " ) -> Iterable[LIDocument]:\n", " file_paths = (\n", " file_path\n", " if isinstance(file_path, Iterable) and not isinstance(file_path, str)\n", " else [file_path]\n", " )\n", " converter = DocumentConverter()\n", " for source in file_paths:\n", " dl_doc = converter.convert_single(source).output\n", " match self.parse_type:\n", " case self.ParseType.MARKDOWN:\n", " text = dl_doc.export_to_markdown()\n", " case self.ParseType.JSON:\n", " text = dl_doc.model_dump_json()\n", " case _:\n", " raise RuntimeError(\n", " f\"Unexpected export type encountered: {self.export_type}\"\n", " )\n", " origin = str(source) if isinstance(source, Path) else source\n", " li_doc = LIDocument(text=text)\n", " li_doc.metadata = {\n", " _KEY_DL_DOC_HASH: dl_doc.file_info.document_hash,\n", " }\n", " if self.include_origin:\n", " li_doc.metadata[_KEY_ORIGIN] = origin\n", " yield li_doc" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from typing import Any, Iterable, Sequence\n", "\n", "from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker\n", "from docling_core.types import Document as DLDocument\n", "from llama_index.core import Document as LIDocument\n", "from llama_index.core.node_parser.interface import NodeParser\n", "from llama_index.core.node_parser.node_utils import IdFuncCallable, default_id_func\n", "from llama_index.core.schema import (\n", " BaseNode,\n", " NodeRelationship,\n", " RelatedNodeType,\n", " TextNode,\n", ")\n", "from llama_index.core.utils import get_tqdm_iterable\n", "\n", "\n", "class DoclingNodeParser(NodeParser):\n", " chunker: BaseChunker = HierarchicalChunker(heading_as_metadata=True)\n", "\n", " def _parse_nodes(\n", " self,\n", " nodes: Sequence[BaseNode],\n", " show_progress: bool = False,\n", " **kwargs: Any,\n", " ) -> list[BaseNode]:\n", " id_func: IdFuncCallable = self.id_func or default_id_func\n", " nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(\n", " items=nodes, show_progress=show_progress, desc=\"Parsing nodes\"\n", " )\n", " all_nodes: list[BaseNode] = []\n", " for input_node in nodes_with_progress:\n", " li_doc = LIDocument.model_validate(input_node)\n", " dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())\n", " chunk_iter = self.chunker.chunk(dl_doc=dl_doc)\n", " for i, chunk in enumerate(chunk_iter):\n", " rels: dict[NodeRelationship, RelatedNodeType] = {\n", " NodeRelationship.SOURCE: li_doc.as_related_node_info(),\n", " }\n", " metadata = chunk.model_dump(\n", " exclude=\"text\",\n", " exclude_none=True,\n", " )\n", " # by default we exclude all meta keys from embedding/LLM — unless allowed\n", " excl_meta_keys = [k for k in metadata if k not in {\"heading\"}]\n", " if self.include_metadata:\n", " excl_meta_keys = [k for k in li_doc.metadata] + excl_meta_keys\n", " node = TextNode(\n", " id_=id_func(i=i, doc=li_doc),\n", " text=chunk.text,\n", " excluded_embed_metadata_keys=excl_meta_keys,\n", " excluded_llm_metadata_keys=excl_meta_keys,\n", " relationships=rels,\n", " )\n", " node.metadata = metadata\n", " all_nodes.append(node)\n", " return all_nodes" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "from llama_index.core.base.response.schema import RESPONSE_TYPE\n", "\n", "\n", "def print_qa(query: str, query_res: RESPONSE_TYPE):\n", " def clip(inp, max_len=100):\n", " if isinstance(inp, str):\n", " return f\"{inp[:max_len]}{'...' if len(inp) > max_len else ''}\"\n", " else:\n", " return inp\n", "\n", " print(\n", " f\"Question:\\n{query}\\n\\nAnswer:\\n{json.dumps(clip(query_res.response.strip()))}\"\n", " )\n", " for i, res in enumerate(query_res.source_nodes):\n", " print()\n", " print(f\"Source {i+1}:\")\n", " print(f\" text: {json.dumps(clip(res.text.strip()))}\")\n", " for key in res.metadata:\n", " print(f\" {key}: {clip(res.metadata.get(key))}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reader and node parser" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Using native Docling format (as JSON)**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To leverage Docling's rich document structure format, we can namely export to JSON and use the `DoclingNodeParser` accordingly:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)\n", "node_parser = DoclingNodeParser()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Using Markdown**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# from llama_index.core.node_parser import MarkdownNodeParser\n", "\n", "# reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n", "# node_parser = MarkdownNodeParser()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transformations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our transformations currently include the `node_parser`:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "transformations = [node_parser]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "One can include add more transformations, e.g. further chunking based on text size / overlap, as shown below:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# from llama_index.core.node_parser import TokenTextSplitter\n", "\n", "# splitter = TokenTextSplitter(\n", "# chunk_size=1024,\n", "# chunk_overlap=0,\n", "# )\n", "# transformations.append(splitter)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Embed model" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", "\n", "embed_model = HuggingFaceEmbedding(model_name=\"intfloat/multilingual-e5-small\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vector store" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "INGEST = True # whether to ingest from scratch or reuse an existing vector store" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from tempfile import TemporaryDirectory\n", "from llama_index.vector_stores.milvus import MilvusVectorStore\n", "\n", "MILVUS_URI = os.environ.get(\n", " \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n", ")\n", "\n", "vector_store = MilvusVectorStore(\n", " uri=MILVUS_URI,\n", " collection_name=\"docling_li_demo\",\n", " dim=len(embed_model.get_text_embedding(\"hi\")),\n", " overwrite=INGEST,\n", ")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from llama_index.core import StorageContext, VectorStoreIndex\n", "\n", "if INGEST:\n", " # in this case we ingest the data into the vector store\n", " docs = reader.load_data(\n", " file_path=\"https://arxiv.org/pdf/2206.01062\", # DocLayNet paper\n", " )\n", " storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", " index = VectorStoreIndex.from_documents(\n", " documents=docs,\n", " embed_model=embed_model,\n", " storage_context=storage_context,\n", " transformations=transformations,\n", " )\n", "else:\n", " # in this case we just load the vector store index\n", " index = VectorStoreIndex.from_vector_store(\n", " vector_store=vector_store,\n", " embed_model=embed_model,\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LLM" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n", "\n", "llm = HuggingFaceInferenceAPI(\n", " token=HF_API_KEY,\n", " model_name=\"mistralai/Mistral-7B-Instruct-v0.3\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## RAG" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Question:\n", "How many pages were annotated by humans?\n", "\n", "Answer:\n", "\"80863 pages were annotated by humans.\"\n", "\n", "Source 1:\n", " text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n", " dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n", " path: $.main-text[37]\n", " page: 2\n", " bbox: [317.2852478027344, 116.46983337402344, 559.7131958007812, 201.73675537109375]\n", " heading: 3 THE DOCLAYNET DATASET\n", "\n", "Source 2:\n", " text: \"In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-tr...\"\n", " dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n", " path: $.main-text[23]\n", " page: 2\n", " bbox: [53.50020980834961, 212.36782836914062, 295.56396484375, 286.4964599609375]\n", " heading: 1 INTRODUCTION\n" ] } ], "source": [ "query_engine = index.as_query_engine(llm=llm)\n", "QUERY = \"How many pages were annotated by humans?\"\n", "query_res = query_engine.query(QUERY)\n", "print_qa(query=QUERY, query_res=query_res)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }