# RAG with Docling and 🦙 LlamaIndex

In [1]:
# requirements for this example:
%pip install -qq docling docling-core python-dotenv llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file (if present) so that settings read later
# through os.environ (e.g. MILVUS_URI, HF_API_KEY) can be supplied that way.
load_dotenv()

True

In [3]:
import warnings

# Hide noisy third-party warnings so the notebook output stays readable.
# `module` is a regular expression, so "pydantic|torch" covers both packages.
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
# Disable tokenizers parallelism; background on the related warning:
# https://github.com/huggingface/transformers/issues/5486:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Setup

### Helpers

Below we define:

- `DoclingPDFReader` which will be used to create LlamaIndex documents,
- `DoclingNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents, and
- a helper function for QA printing

In [4]:
from enum import Enum
from pathlib import Path
from typing import Any, Iterable

from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.schema import Document as LIDocument

from docling.document_converter import DocumentConverter

# Metadata keys set by DoclingPDFReader on each produced LlamaIndex document:
# the Docling document hash (always) and the source path/URL (only if requested).
_KEY_DL_DOC_HASH = "dl_doc_hash"
_KEY_ORIGIN = "origin"


class DoclingPDFReader(BasePydanticReader):
 class ParseType(str, Enum):
 MARKDOWN = "markdown"
 JSON = "json"

 parse_type: ParseType = ParseType.MARKDOWN
 include_origin: bool = False

 def lazy_load_data(
 self,
 file_path: str | Path | Iterable[str] | Iterable[Path],
 *args: Any,
 **load_kwargs: Any,
 ) -> Iterable[LIDocument]:
 file_paths = (
 file_path
 if isinstance(file_path, Iterable) and not isinstance(file_path, str)
 else [file_path]
 )
 converter = DocumentConverter()
 for source in file_paths:
 dl_doc = converter.convert_single(source).output
 match self.parse_type:
 case self.ParseType.MARKDOWN:
 text = dl_doc.export_to_markdown()
 case self.ParseType.JSON:
 text = dl_doc.model_dump_json()
 case _:
 raise RuntimeError(
 f"Unexpected export type encountered: {self.export_type}"
 )
 origin = str(source) if isinstance(source, Path) else source
 li_doc = LIDocument(text=text)
 li_doc.metadata = {
 _KEY_DL_DOC_HASH: dl_doc.file_info.document_hash,
 }
 if self.include_origin:
 li_doc.metadata[_KEY_ORIGIN] = origin
 yield li_doc

In [5]:
from typing import Any, Iterable, Sequence

from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker
from docling_core.types import Document as DLDocument
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser.interface import NodeParser
from llama_index.core.node_parser.node_utils import IdFuncCallable, default_id_func
from llama_index.core.schema import (
 BaseNode,
 NodeRelationship,
 RelatedNodeType,
 TextNode,
)
from llama_index.core.utils import get_tqdm_iterable


class DoclingNodeParser(NodeParser):
    """Node parser that chunks JSON-serialized Docling documents into text nodes.

    The content of every input node is expected to be the JSON dump of a
    Docling document; it is deserialized and split with ``chunker``.
    """

    # Chunker applied to each deserialized Docling document.
    chunker: BaseChunker = HierarchicalChunker(heading_as_metadata=True)

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[BaseNode]:
        """Split each input document into one ``TextNode`` per Docling chunk.

        Args:
            nodes: Input nodes whose content is a JSON-serialized Docling document.
            show_progress: Whether to wrap the iteration in a progress bar.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            The text nodes of all input documents, in input order. Each node
            links back to its source document and carries the chunk's fields
            (except the text itself) as metadata.
        """
        id_func: IdFuncCallable = self.id_func or default_id_func
        nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(
            items=nodes, show_progress=show_progress, desc="Parsing nodes"
        )
        all_nodes: list[BaseNode] = []
        for input_node in nodes_with_progress:
            li_doc = LIDocument.model_validate(input_node)
            dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())
            chunk_iter = self.chunker.chunk(dl_doc=dl_doc)
            for i, chunk in enumerate(chunk_iter):
                rels: dict[NodeRelationship, RelatedNodeType] = {
                    NodeRelationship.SOURCE: li_doc.as_related_node_info(),
                }
                # Pass the excluded field as a set (the documented form) rather
                # than a bare string, so only the exact field "text" is dropped.
                metadata = chunk.model_dump(
                    exclude={"text"},
                    exclude_none=True,
                )
                # by default we exclude all meta keys from embedding/LLM — unless allowed
                excl_meta_keys = [k for k in metadata if k not in {"heading"}]
                if self.include_metadata:
                    # Also keep the source document's own metadata keys out of
                    # the embedding/LLM text.
                    excl_meta_keys = [*li_doc.metadata, *excl_meta_keys]
                node = TextNode(
                    id_=id_func(i=i, doc=li_doc),
                    text=chunk.text,
                    excluded_embed_metadata_keys=excl_meta_keys,
                    excluded_llm_metadata_keys=excl_meta_keys,
                    relationships=rels,
                )
                node.metadata = metadata
                all_nodes.append(node)
        return all_nodes

In [6]:
import json

from llama_index.core.base.response.schema import RESPONSE_TYPE


def print_qa(query: str, query_res: RESPONSE_TYPE):
    """Print a question, the generated answer, and the retrieved source nodes.

    Long strings are truncated to keep the output compact; the answer and the
    source texts are additionally JSON-encoded so that quotes and newlines
    appear escaped on a single line.
    """

    def clip(inp, max_len=100):
        # Non-string values (numbers, lists, ...) pass through untouched.
        if not isinstance(inp, str):
            return inp
        suffix = "..." if len(inp) > max_len else ""
        return f"{inp[:max_len]}{suffix}"

    answer = json.dumps(clip(query_res.response.strip()))
    print(f"Question:\n{query}\n\nAnswer:\n{answer}")
    for position, source in enumerate(query_res.source_nodes, start=1):
        print()
        print(f"Source {position}:")
        snippet = json.dumps(clip(source.text.strip()))
        print(f" text: {snippet}")
        for key, value in source.metadata.items():
            print(f" {key}: {clip(value)}")

### Reader and node parser

**Using native Docling format (as JSON)**

To leverage Docling's rich document structure format, we can export to JSON and use the `DoclingNodeParser` accordingly:

In [7]:
# The reader stores each converted document as its JSON dump, which
# DoclingNodeParser deserializes again before chunking.
reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)
node_parser = DoclingNodeParser()

**Using Markdown**

Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:

In [8]:
# from llama_index.core.node_parser import MarkdownNodeParser

# reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)
# node_parser = MarkdownNodeParser()

### Transformations

Our transformations currently include the `node_parser`:

In [9]:
# Pipeline handed to VectorStoreIndex.from_documents at ingestion time; kept as a
# list so that further steps can be appended.
transformations = [node_parser]

One can add more transformations, e.g. further chunking based on text size / overlap, as shown below:

In [10]:
# from llama_index.core.node_parser import TokenTextSplitter

# splitter = TokenTextSplitter(
# chunk_size=1024,
# chunk_overlap=0,
# )
# transformations.append(splitter)

### Embed model

In [11]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model shared by the vector store setup (to determine the vector
# dimension) and by the index (to embed documents and queries).
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-small")

### Vector store

In [12]:
INGEST = True # whether to ingest from scratch or reuse an existing vector store

In [13]:
from tempfile import TemporaryDirectory
from llama_index.vector_stores.milvus import MilvusVectorStore

# Fall back to a file-based Milvus database inside a fresh temporary directory
# when MILVUS_URI is not set. The directory object stays bound to `tmp_dir` at
# module level so it is not discarded while the notebook is in use.
tmp_dir = TemporaryDirectory()
MILVUS_URI = os.environ.get("MILVUS_URI", f"{tmp_dir.name}/milvus_demo.db")

# Milvus-backed vector store. The vector dimension is obtained by embedding a
# throwaway string with `embed_model`; `overwrite` follows the INGEST flag.
vector_store = MilvusVectorStore(
 uri=MILVUS_URI,
 collection_name="docling_li_demo",
 dim=len(embed_model.get_text_embedding("hi")),
 overwrite=INGEST,
)

In [14]:
from llama_index.core import StorageContext, VectorStoreIndex

if not INGEST:
    # Reuse mode: build the index directly on top of the existing vector store.
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )
else:
    # Ingest mode: convert the DocLayNet paper, then run the transformations,
    # embed the resulting nodes and write them to the vector store.
    docs = reader.load_data(file_path="https://arxiv.org/pdf/2206.01062")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents=docs,
        embed_model=embed_model,
        storage_context=storage_context,
        transformations=transformations,
    )

### LLM

In [15]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# The token is read from the environment rather than hard-coded; it is None when
# HF_API_KEY is not set.
HF_API_KEY = os.environ.get("HF_API_KEY")

# Hosted model used for answer generation.
llm = HuggingFaceInferenceAPI(
 token=HF_API_KEY,
 model_name="mistralai/Mistral-7B-Instruct-v0.3",
)

## RAG

In [16]:
# Question to ask about the ingested paper.
QUERY = "How many pages were annotated by humans?"

# Retrieve from the index, generate the answer with the LLM, and show both the
# answer and the source chunks it was based on.
query_engine = index.as_query_engine(llm=llm)
query_res = query_engine.query(QUERY)
print_qa(query=QUERY, query_res=query_res)

Question:
How many pages were annotated by humans?

Answer:
"80863 pages were annotated by humans."

Source 1:
 text: "DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ..."
 dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc
 path: $.main-text[37]
 page: 2
 bbox: [317.2852478027344, 116.46983337402344, 559.7131958007812, 201.73675537109375]
 heading: 3 THE DOCLAYNET DATASET

Source 2:
 text: "In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-tr..."
 dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc
 path: $.main-text[23]
 page: 2
 bbox: [53.50020980834961, 212.36782836914062, 295.56396484375, 286.4964599609375]
 heading: 1 INTRODUCTION
