{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advanced Chunking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install -qU docling docling-core sentence-transformers transformers semchunk lancedb pydantic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "from tempfile import mkdtemp\n",
    "from typing import Any, Iterator, Optional\n",
    "\n",
    "import lancedb\n",
    "import semchunk\n",
    "from docling_core.transforms.chunker import (\n",
    "    BaseChunk,\n",
    "    BaseChunker,\n",
    "    DocMeta,\n",
    "    HierarchicalChunker,\n",
    ")\n",
    "from docling_core.transforms.chunker.hierarchical_chunker import DocChunk\n",
    "from docling_core.types import DoclingDocument\n",
    "from pydantic import ConfigDict, PositiveInt\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "from docling.document_converter import DocumentConverter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "DOC_SOURCE = \"http://bill.murdocks.org/iccbr2011murdock_web.pdf\"\n",
    "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
    "TOKENIZER = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n",
    "EMBED_MODEL = SentenceTransformer(EMBED_MODEL_ID)\n",
    "MAX_TOKENS = 64"
   ]
  },
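  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check, one can tokenize an arbitrary sample sentence to see what a budget of `MAX_TOKENS` tokens corresponds to for this tokenizer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: tokenize an arbitrary sample sentence, to give the MAX_TOKENS\n",
    "# budget a concrete reference point.\n",
    "sample_sentence = \"Structure mapping determines similarity between questions and passages.\"\n",
    "sample_tokens = TOKENIZER.tokenize(sample_sentence)\n",
    "print(f\"{len(sample_tokens)} tokens: {sample_tokens}\")"
   ]
  },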
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chunker Definition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class HybridChunker(BaseChunker):\n",
    "\n",
    "    model_config: ConfigDict = ConfigDict(arbitrary_types_allowed=True)\n",
    "\n",
    "    inner_chunker: BaseChunker = HierarchicalChunker()\n",
    "    # TODO: improve typing for the tokenizer below (ran into issues with `PreTrainedTokenizer`):\n",
    "    tokenizer: Any\n",
    "    max_tokens: PositiveInt\n",
    "\n",
    "    def _count_tokens(self, text: Optional[str | list[str]]) -> int:\n",
    "        if text is None:\n",
    "            return 0\n",
    "        if isinstance(text, list):\n",
    "            # Sum the token counts over all strings in the list.\n",
    "            return sum(self._count_tokens(t) for t in text)\n",
    "        return len(self.tokenizer.tokenize(text, max_length=None))\n",
    "\n",
    "    def _make_splitter(self):\n",
    "        return semchunk.chunkerify(self.tokenizer, self.max_tokens)\n",
    "\n",
    "    @dataclass\n",
    "    class _ChunkLengthInfo:\n",
    "        total_len: int\n",
    "        text_len: int\n",
    "        other_len: int\n",
    "\n",
    "    def _doc_chunk_length(self, doc_chunk: DocChunk) -> _ChunkLengthInfo:\n",
    "        text_length = self._count_tokens(doc_chunk.text)\n",
    "        # _count_tokens handles None and lists, which keeps this code simple:\n",
    "        headings_length = self._count_tokens(doc_chunk.meta.headings)\n",
    "        captions_length = self._count_tokens(doc_chunk.meta.captions)\n",
    "        total = text_length + headings_length + captions_length\n",
    "        return self._ChunkLengthInfo(\n",
    "            total_len=total,\n",
    "            text_len=text_length,\n",
    "            other_len=total - text_length,\n",
    "        )\n",
    "\n",
    "    @classmethod\n",
    "    def _make_chunk_from_doc_items(\n",
    "        cls, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int\n",
    "    ) -> DocChunk:\n",
    "        meta = DocMeta(\n",
    "            doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],\n",
    "            headings=doc_chunk.meta.headings,\n",
    "            captions=doc_chunk.meta.captions,\n",
    "        )\n",
    "        return DocChunk(text=window_text, meta=meta)\n",
    "\n",
    "    @classmethod\n",
    "    def _merge_text(cls, t1: str, t2: str) -> str:\n",
    "        if t1 == \"\":\n",
    "            return t2\n",
    "        if t2 == \"\":\n",
    "            return t1\n",
    "        return t1 + \"\\n\" + t2\n",
    "\n",
    "    def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:\n",
    "        if doc_chunk.meta.doc_items is None or len(doc_chunk.meta.doc_items) <= 1:\n",
    "            return [doc_chunk]\n",
    "        length = self._doc_chunk_length(doc_chunk)\n",
    "        if length.total_len <= self.max_tokens:\n",
    "            return [doc_chunk]\n",
    "        else:\n",
    "            chunks = []\n",
    "            window_start = 0\n",
    "            window_end = 0\n",
    "            window_text = \"\"\n",
    "            window_text_length = 0\n",
    "            other_length = length.other_len\n",
    "            num_items = len(doc_chunk.meta.doc_items)\n",
    "            while window_end < num_items:\n",
    "                doc_item = doc_chunk.meta.doc_items[window_end]\n",
    "                text = doc_item.text\n",
    "                text_length = self._count_tokens(text)\n",
    "                if (\n",
    "                    text_length + window_text_length + other_length < self.max_tokens\n",
    "                    and window_end < num_items - 1\n",
    "                ):\n",
    "                    # Still room left in this chunk AND at least one item left\n",
    "                    # beyond the window: grow the window.\n",
    "                    window_end += 1\n",
    "                    window_text_length += text_length\n",
    "                    window_text = self._merge_text(window_text, text)\n",
    "                elif text_length + window_text_length + other_length < self.max_tokens:\n",
    "                    # All items in the window fit into the chunk and no items are\n",
    "                    # left: emit the window as a chunk.\n",
    "                    window_text = self._merge_text(window_text, text)\n",
    "                    new_chunk = self._make_chunk_from_doc_items(\n",
    "                        doc_chunk, window_text, window_start, window_end\n",
    "                    )\n",
    "                    chunks.append(new_chunk)\n",
    "                    window_end = num_items\n",
    "                elif window_start == window_end:\n",
    "                    # A single item that does not fit: emit it as a chunk anyway;\n",
    "                    # the plain-text splitter will break it up later.\n",
    "                    window_text = self._merge_text(window_text, text)\n",
    "                    new_chunk = self._make_chunk_from_doc_items(\n",
    "                        doc_chunk, window_text, window_start, window_end\n",
    "                    )\n",
    "                    chunks.append(new_chunk)\n",
    "                    window_start = window_end + 1\n",
    "                    window_end = window_start\n",
    "                    window_text = \"\"\n",
    "                    window_text_length = 0\n",
    "                else:\n",
    "                    # Multiple items in the window, and the current one does not fit.\n",
    "                    # The items accumulated so far must have fit (or we would not have\n",
    "                    # gotten here), so emit them as a chunk and start a new window AT\n",
    "                    # the current item.\n",
    "                    new_chunk = self._make_chunk_from_doc_items(\n",
    "                        doc_chunk, window_text, window_start, window_end - 1\n",
    "                    )\n",
    "                    chunks.append(new_chunk)\n",
    "                    window_start = window_end\n",
    "                    window_text = \"\"\n",
    "                    window_text_length = 0\n",
    "            return chunks\n",
    "\n",
    "    def _split_using_plain_text(\n",
    "        self,\n",
    "        doc_chunk: DocChunk,\n",
    "        plain_text_splitter,\n",
    "    ) -> list[DocChunk]:\n",
    "        lengths = self._doc_chunk_length(doc_chunk)\n",
    "        if lengths.total_len <= self.max_tokens:\n",
    "            return [doc_chunk]\n",
    "        else:\n",
    "            # How much room is left for text after subtracting the headings and captions:\n",
    "            available_length = self.max_tokens - lengths.other_len\n",
    "            if available_length <= 0:\n",
    "                raise ValueError(\n",
    "                    \"The headings and captions for this chunk are longer than the chunk's token budget; this case is not currently supported.\"\n",
    "                )\n",
    "            text = doc_chunk.text\n",
    "            segments = plain_text_splitter.chunk(text)\n",
    "            return [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]\n",
    "\n",
    "    def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]) -> list[DocChunk]:\n",
    "        output_chunks = []\n",
    "        window_start = 0\n",
    "        window_end = 0\n",
    "        num_chunks = len(chunks)\n",
    "        while window_end < num_chunks:\n",
    "            chunk = chunks[window_end]\n",
    "            lengths = self._doc_chunk_length(chunk)\n",
    "            headings_and_captions = (chunk.meta.headings, chunk.meta.captions)\n",
    "            if window_start == window_end:\n",
    "                # Starting a new block of chunks to potentially merge.\n",
    "                current_headings_and_captions = headings_and_captions\n",
    "                window_text = chunk.text\n",
    "                window_other_length = lengths.other_len\n",
    "                window_text_length = lengths.text_len\n",
    "                window_items = chunk.meta.doc_items\n",
    "                window_end += 1\n",
    "                first_chunk_of_window = chunk\n",
    "            elif (\n",
    "                headings_and_captions == current_headings_and_captions\n",
    "                and window_text_length + window_other_length + lengths.text_len\n",
    "                <= self.max_tokens\n",
    "            ):\n",
    "                # There is room for the new chunk: add it to the window and continue.\n",
    "                window_text = self._merge_text(window_text, chunk.text)\n",
    "                window_text_length += lengths.text_len\n",
    "                window_items = window_items + chunk.meta.doc_items\n",
    "                window_end += 1\n",
    "            else:\n",
    "                # No more room OR the start of new metadata. Either way, emit the\n",
    "                # current window and use the current window_end as the start of a new\n",
    "                # block; the window state is reset in the next iteration, in the\n",
    "                # window_start == window_end branch above.\n",
    "                if window_start + 1 == window_end:\n",
    "                    # Just one chunk in the window, so use it as is.\n",
    "                    output_chunks.append(first_chunk_of_window)\n",
    "                else:\n",
    "                    new_meta = DocMeta(\n",
    "                        doc_items=window_items,\n",
    "                        headings=current_headings_and_captions[0],\n",
    "                        captions=current_headings_and_captions[1],\n",
    "                    )\n",
    "                    new_chunk = DocChunk(text=window_text, meta=new_meta)\n",
    "                    output_chunks.append(new_chunk)\n",
    "                window_start = window_end\n",
    "\n",
    "        # The loop above only emits a window when a non-mergeable chunk follows it,\n",
    "        # so the final window still has to be flushed here.\n",
    "        if window_start + 1 == window_end:\n",
    "            output_chunks.append(first_chunk_of_window)\n",
    "        elif window_start < window_end:\n",
    "            new_meta = DocMeta(\n",
    "                doc_items=window_items,\n",
    "                headings=current_headings_and_captions[0],\n",
    "                captions=current_headings_and_captions[1],\n",
    "            )\n",
    "            output_chunks.append(DocChunk(text=window_text, meta=new_meta))\n",
    "        return output_chunks\n",
    "\n",
    "    def _merge_chunks_with_mismatching_metadata(self, chunks: list[DocChunk], *_) -> list[DocChunk]:\n",
    "        # Placeholder: for now we do not merge across texts with different\n",
    "        # headings+captions. In principle that seems like a good idea for cases where\n",
    "        # entire sections could be merged, but it is unclear how to handle the\n",
    "        # metadata then, because some of it applies to only part of the merged chunk.\n",
    "        return chunks\n",
    "\n",
    "    def _merge_chunks(self, chunks: list[DocChunk]) -> list[DocChunk]:\n",
    "        # First merge as many chunks as possible that share the same headings+captions;\n",
    "        # only then merge across differing headings+captions, so that merges within a\n",
    "        # section or other grouping are preferred.\n",
    "        initial_merged_chunks = self._merge_chunks_with_matching_metadata(chunks)\n",
    "        final_merged_chunks = self._merge_chunks_with_mismatching_metadata(\n",
    "            initial_merged_chunks\n",
    "        )\n",
    "        return final_merged_chunks\n",
    "\n",
    "    @classmethod\n",
    "    def _make_text_for_embedding(cls, chunk: DocChunk) -> str:\n",
    "        # Prepend the headings and captions to the chunk text, so that they\n",
    "        # contribute to the embedding.\n",
    "        output = \"\"\n",
    "        if chunk.meta.headings is not None:\n",
    "            for h in chunk.meta.headings:\n",
    "                output += h + \"\\n\"\n",
    "        if chunk.meta.captions is not None:\n",
    "            for c in chunk.meta.captions:\n",
    "                output += c + \"\\n\"\n",
    "        output += chunk.text\n",
    "        return output\n",
    "\n",
    "    def _adjust_chunks_for_fixed_size(self, chunks: list[DocChunk], splitter) -> list[DocChunk]:\n",
    "        # Split oversized chunks along doc item boundaries, then split any remaining\n",
    "        # oversized text, merge undersized neighbors back together, and finally\n",
    "        # expand each chunk's text with its headings and captions for embedding.\n",
    "        split_by_items = [x for c in chunks for x in self._split_by_doc_items(c)]\n",
    "        split_recursively = [\n",
    "            x for c in split_by_items for x in self._split_using_plain_text(c, splitter)\n",
    "        ]\n",
    "        merged = self._merge_chunks(split_recursively)\n",
    "        text_expanded = [\n",
    "            DocChunk.model_validate(\n",
    "                {**c.model_dump(), \"text\": self._make_text_for_embedding(c)}\n",
    "            )\n",
    "            for c in merged\n",
    "        ]\n",
    "        return text_expanded\n",
    "\n",
    "    def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:\n",
    "        preliminary_chunks = list(self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs))\n",
    "        splitter = self._make_splitter()\n",
    "        output_chunks = self._adjust_chunks_for_fixed_size(preliminary_chunks, splitter)\n",
    "        return iter(output_chunks)"
   ]
  },
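  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a small sanity check of the helpers above (using arbitrary sample strings), `_merge_text` joins two window texts with a newline and passes empty strings through unchanged:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check on arbitrary sample strings: _merge_text joins non-empty texts\n",
    "# with a newline and passes empty strings through unchanged.\n",
    "print(repr(HybridChunker._merge_text(\"first paragraph\", \"second paragraph\")))\n",
    "print(repr(HybridChunker._merge_text(\"\", \"only text\")))"
   ]
  },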
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using CPU. Note: This module is much faster with a GPU.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "J. William Murdock\n",
      "murdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598\n",
      "39\n",
      "J. William Murdock\n",
      "Abstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct answer to the question. Recognizing\n",
      "70\n",
      "J. William Murdock\n",
      "whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied to determine similarity between content in questions and passages. That algorithm\n",
      "70\n",
      "J. William Murdock\n",
      "is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\n",
      "32\n",
      "1 Introduction\n",
      "Watson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.\n",
      "62\n"
     ]
    }
   ],
   "source": [
    "conv_res = DocumentConverter().convert(source=DOC_SOURCE)\n",
    "doc = conv_res.document\n",
    "\n",
    "chunker = HybridChunker(\n",
    "    tokenizer=TOKENIZER,\n",
    "    max_tokens=MAX_TOKENS,\n",
    ")\n",
    "chunks = list(chunker.chunk(dl_doc=doc))\n",
    "\n",
    "for chunk in chunks[:5]:\n",
    "    print(chunk.text)\n",
    "    print(chunker._count_tokens(chunk.text))"
   ]
  },
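  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each chunk also carries its provenance in `chunk.meta`; as an illustrative check, one can inspect the headings and the number of doc items per chunk:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative check: the headings and number of doc items each chunk carries.\n",
    "for chunk in chunks[:3]:\n",
    "    print(chunk.meta.headings, len(chunk.meta.doc_items))"
   ]
  },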
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vector Retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>vector</th>\n",
       "      <th>text</th>\n",
       "      <th>headings</th>\n",
       "      <th>captions</th>\n",
       "      <th>_distance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[-0.025746439, 0.03888134, 0.0033668755, -0.03...</td>\n",
       "      <td>References\\n3. Forbus, K. and Oblinger, D. (19...</td>\n",
       "      <td>[References]</td>\n",
       "      <td>None</td>\n",
       "      <td>0.332435</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[0.034203544, 0.10181023, 0.003722408, 0.00506...</td>\n",
       "      <td>5 Evaluation and Conclusions\\nconsider to be a...</td>\n",
       "      <td>[5 Evaluation and Conclusions]</td>\n",
       "      <td>None</td>\n",
       "      <td>1.469304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[0.04400234, -0.034766007, -0.00025527124, 0.0...</td>\n",
       "      <td>References\\n4. McCord, M. C. (1990). Slot Gram...</td>\n",
       "      <td>[References]</td>\n",
       "      <td>None</td>\n",
       "      <td>1.525625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[0.112926826, -0.010892201, 0.007714559, -0.06...</td>\n",
       "      <td>3 Syntactic-Semantic Graphs\\nplay , about , Ut...</td>\n",
       "      <td>[3 Syntactic-Semantic Graphs]</td>\n",
       "      <td>None</td>\n",
       "      <td>1.540550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[0.025994677, 0.08402823, 0.03268827, -0.03727...</td>\n",
       "      <td>4 Algorithm\\nIn using this algorithm, we have ...</td>\n",
       "      <td>[4 Algorithm]</td>\n",
       "      <td>None</td>\n",
       "      <td>1.576838</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              vector  \\\n",
       "0  [-0.025746439, 0.03888134, 0.0033668755, -0.03...   \n",
       "1  [0.034203544, 0.10181023, 0.003722408, 0.00506...   \n",
       "2  [0.04400234, -0.034766007, -0.00025527124, 0.0...   \n",
       "3  [0.112926826, -0.010892201, 0.007714559, -0.06...   \n",
       "4  [0.025994677, 0.08402823, 0.03268827, -0.03727...   \n",
       "\n",
       "                                                text  \\\n",
       "0  References\\n3. Forbus, K. and Oblinger, D. (19...   \n",
       "1  5 Evaluation and Conclusions\\nconsider to be a...   \n",
       "2  References\\n4. McCord, M. C. (1990). Slot Gram...   \n",
       "3  3 Syntactic-Semantic Graphs\\nplay , about , Ut...   \n",
       "4  4 Algorithm\\nIn using this algorithm, we have ...   \n",
       "\n",
       "                         headings captions  _distance  \n",
       "0                    [References]     None   0.332435  \n",
       "1  [5 Evaluation and Conclusions]     None   1.469304  \n",
       "2                    [References]     None   1.525625  \n",
       "3   [3 Syntactic-Semantic Graphs]     None   1.540550  \n",
       "4                   [4 Algorithm]     None   1.576838  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def make_lancedb_index(db_uri, index_name, chunks: list[DocChunk], embedding_model):\n",
    "    db = lancedb.connect(db_uri)\n",
    "    data = []\n",
    "    for chunk in chunks:\n",
    "        embeddings = embedding_model.encode(chunk.text)\n",
    "        data_item = {\n",
    "            \"vector\": embeddings,\n",
    "            \"text\": chunk.text,\n",
    "            \"headings\": chunk.meta.headings,\n",
    "            \"captions\": chunk.meta.captions,\n",
    "        }\n",
    "        data.append(data_item)\n",
    "    tbl = db.create_table(index_name, data=data, exist_ok=True)\n",
    "    return tbl\n",
    "\n",
    "\n",
    "db_uri = str(Path(mkdtemp()) / \"docling.db\")  # or set as needed\n",
    "index = make_lancedb_index(db_uri, doc.name, chunks, EMBED_MODEL)\n",
    "\n",
    "sample_query = \"Making SME greedy and pragmatic\"\n",
    "sample_embedding = EMBED_MODEL.encode(sample_query)\n",
    "results = index.search(sample_embedding).limit(5)\n",
    "\n",
    "results.to_pandas()"
   ]
  },
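  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, a minimal sketch of how the retrieved chunk texts could be assembled into a single context string, e.g. for a RAG prompt:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: join the texts of the top retrieved chunks into one context\n",
    "# block, separated by a divider line.\n",
    "retrieved = index.search(sample_embedding).limit(5).to_pandas()\n",
    "context = \"\\n---\\n\".join(retrieved[\"text\"])\n",
    "print(context[:500])"
   ]
  },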
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}