# RAG with Docling and 🦜🔗 LangChain

In [1]:
# requirements for this example:
# NOTE(review): the packages are installed unpinned, while the loader defined
# below calls `DocumentConverter.convert_single(...).output` — presumably a
# pre-2.0 Docling API; confirm and pin the versions this notebook targets.
%pip install -qq docling docling-core python-dotenv langchain langchain-text-splitters langchain-huggingface langchain-milvus

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

from dotenv import load_dotenv

# Populate the process environment from a `.env` file; later cells read
# MILVUS_URI and HF_API_KEY through `os.environ`.
load_dotenv()

True

In [3]:
import warnings

# Ignore UserWarnings attributed to modules matching "pydantic|torch" and
# FutureWarnings attributed to "easyocr".
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
# Turn off tokenizer parallelism, see
# https://github.com/huggingface/transformers/issues/5486:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Setup

### Helpers

Below we set up:
- a `Loader` which will be used to create LangChain documents,
- a splitter, which will be used to split these documents, and
- a helper function for QA printing

In [4]:
from enum import Enum
from typing import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from docling.document_converter import DocumentConverter

# Metadata keys written by `DoclingPDFLoader.lazy_load`: the hash of the
# converted Docling document and (optionally) the source it was loaded from.
_KEY_DL_DOC_HASH = "dl_doc_hash"
_KEY_ORIGIN = "origin"


class DoclingPDFLoader(BaseLoader):
    """LangChain loader backed by a Docling ``DocumentConverter``.

    Each configured source is converted with Docling and emitted as a single
    LangChain document whose page content is either the Markdown export or the
    JSON dump of the converted document, depending on ``parse_type``.
    """

    class ParseType(str, Enum):
        """Serialization used for the page content of the emitted documents."""

        MARKDOWN = "markdown"
        JSON = "json"

    # When truthy, the source is also stored in the metadata under `_KEY_ORIGIN`.
    include_origin: bool = False

    def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:
        """Store the sources and output format and create the converter.

        Args:
            file_path: A single source or a list of sources to convert.
            parse_type: Which serialization ``lazy_load`` should produce.
        """
        if isinstance(file_path, list):
            self._file_paths = file_path
        else:
            # Normalize a single source to a one-element list.
            self._file_paths = [file_path]
        self._parse_type = parse_type
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Convert the sources one by one, yielding a document for each.

        Raises:
            RuntimeError: If ``parse_type`` is neither MARKDOWN nor JSON; this
                is only detected after the current source has been converted.
        """
        for source in self._file_paths:
            dl_doc = self._converter.convert_single(source).output

            parse_type = self._parse_type
            if parse_type == self.ParseType.MARKDOWN:
                text = dl_doc.export_to_markdown()
            elif parse_type == self.ParseType.JSON:
                text = dl_doc.model_dump_json()
            else:
                raise RuntimeError(
                    f"Unexpected parse type encountered: {self._parse_type}"
                )

            metadata = {_KEY_DL_DOC_HASH: dl_doc.file_info.document_hash}
            if self.include_origin:
                metadata[_KEY_ORIGIN] = source

            yield LCDocument(page_content=text, metadata=metadata)

In [5]:
import json
from typing import Iterable, List

from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker
from docling_core.types import Document as DLDocument
from langchain_core.documents import Document as LCDocument


class DoclingSplitter:
    """Split JSON-serialized Docling documents into chunk-level LangChain documents.

    The page content of every incoming document is parsed as a Docling
    document, so the documents must carry the JSON serialization (not the
    Markdown export).
    """

    def __init__(
        self,
        chunker: BaseChunker | None = None,
    ) -> None:
        """Create the splitter.

        Args:
            chunker: Chunker to apply; when omitted (or falsy), a
                ``HierarchicalChunker(heading_as_metadata=True)`` is used.
        """
        self.chunker: BaseChunker = chunker or HierarchicalChunker(
            heading_as_metadata=True
        )

    def split_documents(self, documents: Iterable[LCDocument]) -> List[LCDocument]:
        """Chunk every document and return all chunks as one flat list.

        Args:
            documents: LangChain documents whose ``page_content`` is the JSON
                dump of a Docling document.

        Returns:
            One LangChain document per chunk. Its metadata is the source
            document's metadata merged with the chunk's own fields (chunk
            fields win on key clashes; the chunk text and ``None`` values are
            left out). Non-string iterable values are stored JSON-encoded.
        """
        all_chunk_docs: list[LCDocument] = []
        for doc in documents:
            lc_doc: LCDocument = LCDocument.parse_obj(doc)
            dl_doc: DLDocument = DLDocument.model_validate_json(lc_doc.page_content)
            for chunk in self.chunker.chunk(dl_doc=dl_doc):
                chunk_metadata = chunk.model_dump(
                    # `exclude` takes a set (or mapping) of field names; a bare
                    # string is not a valid value for it.
                    exclude={"text"},
                    exclude_none=True,
                )
                merged = {**lc_doc.metadata, **chunk_metadata}
                # Build a fresh dict instead of rewriting values while
                # iterating: lists, dicts, etc. are serialized to JSON strings,
                # everything else is kept as is.
                metadata = {
                    key: (
                        json.dumps(value)
                        if isinstance(value, Iterable) and not isinstance(value, str)
                        else value
                    )
                    for key, value in merged.items()
                }
                all_chunk_docs.append(
                    LCDocument(
                        page_content=chunk.text,
                        metadata=metadata,
                    )
                )

        return all_chunk_docs

In [6]:
def print_qa(resp_dict):
    """Pretty-print a RAG response: the question, the answer, and its sources.

    Args:
        resp_dict: Mapping with the keys "input" (the question), "answer" and
            "context" (an iterable of documents exposing ``page_content`` and
            ``metadata``).

    String values are shortened to 100 characters followed by "..."; the
    answer and the source texts are printed JSON-encoded. The metadata key
    "pk" is not printed.
    """

    def clip(inp, max_len=100):
        # Only strings are shortened; any other value is passed through.
        if not isinstance(inp, str):
            return inp
        suffix = "..." if len(inp) > max_len else ""
        return f"{inp[:max_len]}{suffix}"

    question = resp_dict["input"]
    answer = json.dumps(clip(resp_dict["answer"]))
    print(f"Question:\n{question}\n\nAnswer:\n{answer}")

    for idx, doc in enumerate(resp_dict["context"], start=1):
        print()
        print(f"Source {idx}:")
        print(f"  text: {json.dumps(clip(doc.page_content))}")
        for key, value in doc.metadata.items():
            if key == "pk":
                continue
            print(f"  {key}: {clip(value)}")

In [7]:
# Source to ingest; it is handed to `DoclingPDFLoader` as `file_path` below.
FILE_PATH = "https://arxiv.org/pdf/2206.01062"  # DocLayNet paper

### Loader and splitter

**Using native Docling format (as JSON)**

To leverage Docling's rich document structure format, we can export to JSON and use the `DoclingSplitter` accordingly:

In [8]:
# The JSON parse type makes the loader emit the serialized Docling document,
# which is what `DoclingSplitter.split_documents` parses back before chunking.
loader = DoclingPDFLoader(
    file_path=FILE_PATH,
    parse_type=DoclingPDFLoader.ParseType.JSON,
)
# No chunker is passed, so the splitter falls back to its default
# `HierarchicalChunker(heading_as_metadata=True)`.
splitter = DoclingSplitter()

**Using Markdown:**

Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:

In [9]:
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# loader = DoclingPDFLoader(
#     file_path=FILE_PATH,
#     parse_type=DoclingPDFLoader.ParseType.MARKDOWN,
# )
# splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=200,
# )

We now use the above-defined objects to get the document splits:

In [10]:
# Load the source as LangChain documents, then break them into the chunk-level
# documents that get embedded and indexed below.
docs = loader.load()
splits = splitter.split_documents(docs)

### Embed model

In [11]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Hugging Face model id of the embedding model; `embedding` is passed to the
# Milvus vector store when the splits are indexed.
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embedding = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

### Vector store

In [12]:
from tempfile import TemporaryDirectory

from langchain_milvus import Milvus

# Use the URI from the MILVUS_URI environment variable when it is set;
# otherwise fall back to a `milvus_demo.db` path inside a fresh temporary
# directory. The directory is created only when it is actually needed, and
# `tmp_dir` holds a reference to it so that it is not removed while the
# notebook is still using it.
tmp_dir = None
MILVUS_URI = os.environ.get("MILVUS_URI")
if MILVUS_URI is None:
    tmp_dir = TemporaryDirectory()
    MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"

# Embed the splits and index them in the "docling_lc_demo" collection.
# NOTE(review): `drop_old=True` presumably replaces an existing collection of
# the same name — confirm before pointing MILVUS_URI at a shared instance.
vectorstore = Milvus.from_documents(
    splits,
    embedding,
    connection_args={"uri": MILVUS_URI},
    collection_name="docling_lc_demo",
    drop_old=True,
)

### LLM

In [13]:
from langchain_huggingface import HuggingFaceEndpoint

# Read the Hugging Face token from the environment; the value is None when
# HF_API_KEY is not set.
HF_API_KEY = os.environ.get("HF_API_KEY")
HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# LLM used by the question-answering chain in the RAG section.
llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/pva/.cache/huggingface/token
Login successful


## RAG

In [14]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import PromptTemplate

# Retriever over the indexed splits.
retriever = vectorstore.as_retriever()
# Prompt with two placeholders: `{context}` for the retrieved documents and
# `{input}` for the user's query.
prompt = PromptTemplate.from_template(
    "Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {input}\nAnswer:\n"
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# The response of this chain is printed with `print_qa`, which reads its
# "input", "answer" and "context" entries.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [15]:
# Run the RAG chain on a sample question and print the answer together with
# the retrieved source chunks and their metadata.
resp_dict = rag_chain.invoke(
    {"input": "How many pages were human annotated by humans for DocLayNet?"}
)
print_qa(resp_dict=resp_dict)

Question:
How many pages were human annotated by humans for DocLayNet?

Answer:
"80863 pages were annotated by humans in DocLayNet."

Source 1:
  text: "DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ..."
  bbox: [317.2852478027344, 116.46983337402344, 559.7131958007812, 201.73675537109375]
  dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc
  heading: 3 THE DOCLAYNET DATASET
  page: 2
  path: $.main-text[37]

Source 2:
  text: "In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-tr..."
  bbox: [53.50020980834961, 212.36782836914062, 295.56396484375, 286.4964599609375]
  dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc
  heading: 1 INTRODUCTION
  page: 2
  path: $.main-text[23]

Source 3:
  text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
  bbox: [53.60108947753906, 723.3781127929688, 347.139892578125,