# RAG with Docling and 🦙 LlamaIndex

In [1]:
# requirements for this example:
%pip install -qq docling docling-core python-dotenv llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from tempfile import TemporaryDirectory

from dotenv import load_dotenv
from pydantic import TypeAdapter
from rich.pretty import pprint

# Load variables from a `.env` file (if one is found) into the environment, so
# the `os.environ` lookups in later cells (MILVUS_*, HF_API_KEY) can be
# configured without editing the notebook.
load_dotenv()

True

In [3]:
import warnings

# Keep the notebook output readable by silencing known-noisy third-party warnings.
# `module` is a regular expression matched against the name of the module that
# triggers the warning, so "pydantic|torch" covers both packages.
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

## Setup

### Helpers

Below we define:

- `DoclingPDFReader` which will be used to create LlamaIndex documents, and
- `HierarchicalJSONNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents


In [4]:
from enum import Enum
from pathlib import Path
from typing import Any, Iterable

from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.schema import Document as LIDocument

from docling.document_converter import DocumentConverter

class DocMetaKeys(str, Enum):
    """Metadata keys set on every LlamaIndex document produced by `DoclingPDFReader`."""

    # Hash of the converted Docling document (read from `file_info.document_hash`).
    DL_DOC_HASH = "dl_doc_hash"
    # The input source the document was converted from, stored as a string.
    ORIGIN = "origin"

class DoclingPDFReader(BasePydanticReader):
    class ParseType(str, Enum):
        MARKDOWN = "markdown"
        JSON = "json"

    parse_type: ParseType = ParseType.MARKDOWN

    def lazy_load_data(
        self,
        file_path: str | Path | Iterable[str] | Iterable[Path],
        *args: Any,
        **load_kwargs: Any,
    ) -> Iterable[LIDocument]:
        file_paths = (
            file_path
            if isinstance(file_path, Iterable) and not isinstance(file_path, str)
            else [file_path]
        )
        converter = DocumentConverter()
        for source in file_paths:
            dl_doc = converter.convert_single(source).output
            match self.parse_type:
                case self.ParseType.MARKDOWN:
                    text = dl_doc.export_to_markdown()
                case self.ParseType.JSON:
                    text = dl_doc.model_dump_json()
                case _:
                    raise RuntimeError(
                        f"Unexpected export type encountered: {self.export_type}"
                    )
            origin = str(source) if isinstance(source, Path) else source
            li_doc = LIDocument(text=text)
            li_doc.metadata = {
                DocMetaKeys.DL_DOC_HASH: dl_doc.file_info.document_hash,
                DocMetaKeys.ORIGIN: origin,
            }
            yield li_doc

In [5]:
from typing import Any, Iterable, Sequence

from docling_core.transforms.chunker import ChunkWithMetadata, HierarchicalChunker
from docling_core.types import Document as DLDocument
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser.interface import NodeParser
from llama_index.core.schema import (
    BaseNode,
    NodeRelationship,
    RelatedNodeType,
    TextNode,
)
from llama_index.core.utils import get_tqdm_iterable


class NodeMetaKeys(str, Enum):
    """Metadata keys used for the nodes produced by `HierarchicalJSONNodeParser`."""

    # Path of the chunk as reported by the chunker (`chunk.path`).
    PATH = "path"
    # Page of the chunk; only set for `ChunkWithMetadata` chunks.
    PAGE = "page"
    # Bounding box of the chunk; only set for `ChunkWithMetadata` chunks.
    BBOX = "bbox"
    # NOTE(review): never set by the parser itself, only listed among the
    # excluded metadata keys — presumably inherited from the source document's
    # metadata; confirm.
    ORIGIN = "origin"


class HierarchicalJSONNodeParser(NodeParser):
    """Node parser for LlamaIndex documents holding JSON-serialized Docling documents.

    Each input document is deserialized back into a Docling document, chunked
    with `HierarchicalChunker`, and turned into one `TextNode` per chunk.
    """

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[BaseNode]:
        """Split JSON-based documents into text nodes, one per chunk.

        Args:
            nodes: Input nodes whose content is the JSON dump of a Docling document.
            show_progress: Whether to wrap the iteration in a progress bar.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The text nodes of all input documents, in input and chunk order.
            Every node references its source document, always carries the
            `path` metadata entry, and additionally `page` and `bbox` when the
            chunk is a `ChunkWithMetadata`.
        """
        nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(
            items=nodes, show_progress=show_progress, desc="Parsing nodes"
        )
        # Loop-invariant: all document- and node-level metadata keys are kept
        # out of both the embedding text and the LLM prompt text.
        excl_meta_keys = [key.value for key in (*DocMetaKeys, *NodeMetaKeys)]
        all_nodes: list[BaseNode] = []
        chunker = HierarchicalChunker()
        for input_node in nodes_with_progress:
            li_doc = LIDocument.model_validate(input_node)
            dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())
            for chunk in chunker.chunk(dl_doc=dl_doc):
                rels: dict[NodeRelationship, RelatedNodeType] = {
                    NodeRelationship.SOURCE: li_doc.as_related_node_info(),
                }
                node = TextNode(
                    text=chunk.text,
                    # Hand each field its own copy so nodes never share a list.
                    excluded_embed_metadata_keys=list(excl_meta_keys),
                    excluded_llm_metadata_keys=list(excl_meta_keys),
                    relationships=rels,
                )
                # Plain string keys, consistent with the exclusion lists above.
                node.metadata = {NodeMetaKeys.PATH.value: chunk.path}
                if isinstance(chunk, ChunkWithMetadata):
                    node.metadata[NodeMetaKeys.PAGE.value] = chunk.page
                    node.metadata[NodeMetaKeys.BBOX.value] = chunk.bbox
                all_nodes.append(node)
        return all_nodes

### Reader and node parser

#### Using JSON

To leverage Docling's rich document structure format, we can export to JSON and use the `HierarchicalJSONNodeParser` accordingly:

In [6]:
# Emit each converted document as a JSON dump, which is the input format the
# hierarchical node parser deserializes.
reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)
# Turns each JSON-based document into one text node per chunk.
node_parser = HierarchicalJSONNodeParser()

#### Using Markdown

Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:

In [7]:
# from llama_index.core.node_parser import MarkdownNodeParser

# reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)
# node_parser = MarkdownNodeParser()

### Transformations

Our transformations currently include the `node_parser`:

In [8]:
# List of transformations handed to `VectorStoreIndex.from_documents` during ingestion.
transformations = [node_parser]

One can add more transformations, e.g. further chunking based on text size / overlap, as shown below:

In [9]:
# from llama_index.core.node_parser import TokenTextSplitter

# splitter = TokenTextSplitter(
#     chunk_size=1024,
#     chunk_overlap=0,
# )
# transformations.append(splitter)

### Embed model

In [10]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model; used below to size the vector store and to build / load the index.
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-small")

### Vector store

In [11]:
# Whether to ingest from scratch (True) or reuse an existing vector store (False).
# The same flag is also passed as the vector store's `overwrite` argument.
INGEST = True

In [12]:
from llama_index.vector_stores.milvus import MilvusVectorStore

# Fallback location: a database file inside a fresh temporary directory, bound
# to `tmp_dir`. Note that the directory is created even when MILVUS_URL is set,
# because the default value is evaluated eagerly.
tmp_dir = TemporaryDirectory()
MILVUS_URL = os.environ.get("MILVUS_URL", f"{tmp_dir.name}/milvus_demo.db")
MILVUS_COLL_NAME = os.environ.get("MILVUS_COLL_NAME", "basic_llamaindex_pipeline")
# Optional extra keyword arguments for the vector store, given as a JSON object.
MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get("MILVUS_KWARGS", "{}"))

vector_store = MilvusVectorStore(
    uri=MILVUS_URL,
    collection_name=MILVUS_COLL_NAME,
    # Embed a short probe string to find the embedding dimensionality.
    dim=len(embed_model.get_text_embedding("hi")),
    overwrite=INGEST,
    **MILVUS_KWARGS,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
from llama_index.core import StorageContext, VectorStoreIndex

if not INGEST:
    # Nothing to ingest: build the index directly on top of the existing vector store.
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )
else:
    # Convert the source document, show a truncated preview of the result, and
    # ingest it into the vector store through the configured transformations.
    docs = reader.load_data(
        file_path="https://arxiv.org/pdf/2206.01062",  # DocLayNet paper
    )
    pprint(docs, max_length=1, max_string=50, max_depth=4)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents=docs,
        embed_model=embed_model,
        storage_context=storage_context,
        transformations=transformations,
    )

### LLM

In [14]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Read the Hugging Face token from the environment; this is `None` when the
# variable is not set.
HF_API_KEY = os.environ.get("HF_API_KEY")

# LLM that is handed to the query engine in the RAG section.
llm = HuggingFaceInferenceAPI(
    token=HF_API_KEY,
    model_name="mistralai/Mistral-7B-Instruct-v0.3",
)

## RAG

In [15]:
# Build a query engine on top of the index, using the LLM configured above.
query_engine = index.as_query_engine(llm=llm)
# Example question about the DocLayNet paper.
query_res = query_engine.query("How many pages were annotated by humans?")
# Pretty-print a truncated view of the response object.
pprint(query_res, max_length=5, max_string=250, max_depth=6)