{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Advanced Chunking" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# %pip install -qU docling docling-core sentence-transformers transformers semchunk lancedb pydantic\n", "\n", "# FIXME temp install line\n", "%pip install -qU \"docling-core @ git+https://github.com/DS4SD/docling-core.git@expand-chunking\" sentence-transformers transformers semchunk lancedb pydantic" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "from dataclasses import dataclass\n", "from pathlib import Path\n", "from tempfile import mkdtemp\n", "from typing import Iterator, Optional, Self, Union\n", "\n", "import lancedb\n", "import semchunk\n", "from docling_core.transforms.chunker import (\n", " BaseChunk,\n", " BaseChunker,\n", " DocChunk,\n", " DocMeta,\n", " HierarchicalChunker,\n", ")\n", "from docling_core.types import DoclingDocument\n", "from pydantic import ConfigDict, PositiveInt, TypeAdapter, model_validator\n", "from sentence_transformers import SentenceTransformer\n", "from transformers import AutoTokenizer, PreTrainedTokenizerBase\n", "\n", "from docling.document_converter import DocumentConverter" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", "MAX_TOKENS = 64\n", "DOC_SOURCE = \"http://bill.murdocks.org/iccbr2011murdock_web.pdf\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n", "embed_model = SentenceTransformer(EMBED_MODEL_ID)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chunker Definition" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "class DocChunker(BaseChunker):\n", "\n", " model_config: ConfigDict = ConfigDict(arbitrary_types_allowed=True)\n", "\n", " tokenizer: PreTrainedTokenizerBase\n", "\n", " inner_chunker: BaseChunker = HierarchicalChunker()\n", " max_tokens: int = None # actual dflt value resolved in validator based on tokenizer\n", " delim: str = \"\\n\"\n", "\n", " @model_validator(mode=\"after\")\n", " def patch_max_tokens(self) -> Self:\n", " if self.max_tokens is None:\n", " print(f\"{self.tokenizer.model_max_length=}\")\n", " self.max_tokens = TypeAdapter(PositiveInt).validate_python(\n", " self.tokenizer.model_max_length\n", " )\n", " return self\n", "\n", " def _count_tokens(self, text: Optional[Union[str, list[str]]]):\n", " if text is None:\n", " return 0\n", " elif isinstance(text, list):\n", " total = 0\n", " for t in text:\n", " total += self._count_tokens(t)\n", " return total\n", " return len(self.tokenizer.tokenize(text, max_length=None))\n", "\n", " @dataclass\n", " class _ChunkLengthInfo:\n", " total_len: int\n", " text_len: int\n", " other_len: int\n", "\n", " def _doc_chunk_length(self, doc_chunk: DocChunk):\n", " text_length = self._count_tokens(doc_chunk.text)\n", " # Note that count_tokens handles None and lists, making this code simpler:\n", " # TODO check if delim properly considered\n", " headings_length = self._count_tokens(doc_chunk.meta.headings)\n", " captions_length = self._count_tokens(doc_chunk.meta.captions)\n", " total = text_length + headings_length + captions_length\n", " return self._ChunkLengthInfo(\n", " total_len=total,\n", " text_len=text_length,\n", " other_len=total - text_length,\n", " )\n", "\n", " def _make_chunk_from_doc_items(\n", " self, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int\n", " ):\n", " meta = DocMeta(\n", " doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],\n", " headings=doc_chunk.meta.headings,\n", " captions=doc_chunk.meta.captions,\n", " )\n", " new_chunk = DocChunk.from_data(text=window_text, meta=meta, delim=self.delim)\n", " return new_chunk\n", "\n", " def _merge_text(self, t1, t2):\n", " if t1 == \"\":\n", " return t2\n", " elif t2 == \"\":\n", " return t1\n", " else:\n", " return f\"{t1}{self.delim}{t2}\"\n", "\n", " def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:\n", " if doc_chunk.meta.doc_items == None or len(doc_chunk.meta.doc_items) <= 1:\n", " return [doc_chunk]\n", " length = self._doc_chunk_length(doc_chunk)\n", " if length.total_len <= self.max_tokens:\n", " return [doc_chunk]\n", " else:\n", " chunks = []\n", " window_start = 0\n", " window_end = 0\n", " window_text = \"\"\n", " window_text_length = 0\n", " other_length = length.other_len\n", " l = len(doc_chunk.meta.doc_items)\n", " while window_end < l:\n", " doc_item = doc_chunk.meta.doc_items[window_end]\n", " text = doc_item.text\n", " text_length = self._count_tokens(text)\n", " if (\n", " text_length + window_text_length + other_length < self.max_tokens\n", " and window_end < l - 1\n", " ):\n", " # Still room left to add more to this chunk AND still at least one item left\n", " window_end += 1\n", " window_text_length += text_length\n", " window_text = self._merge_text(window_text, text)\n", " elif text_length + window_text_length + other_length < self.max_tokens:\n", " # All the items in the window fit into the chunk and there are no other items left\n", " window_text = self._merge_text(window_text, text)\n", " new_chunk = self._make_chunk_from_doc_items(\n", " doc_chunk, window_text, window_start, window_end\n", " )\n", " chunks.append(new_chunk)\n", " window_end = l\n", " elif window_start == window_end:\n", " # Only one item in the window and it doesn't fit into the chunk. So we'll just make it a chunk for now and it will get split in the plain text splitter.\n", " window_text = self._merge_text(window_text, text)\n", " new_chunk = self._make_chunk_from_doc_items(\n", " doc_chunk, window_text, window_start, window_end\n", " )\n", " chunks.append(new_chunk)\n", " window_start = window_end + 1\n", " window_end = window_start\n", " window_text = \"\"\n", " window_text_length = 0\n", " else:\n", " # Multiple items in the window but they don't fit into the chunk. However, the existing items must have fit or we wouldn't have gotten here.\n", " # So we put everything but the last item into the chunk and then start a new window INCLUDING the current window end.\n", " new_chunk = self._make_chunk_from_doc_items(\n", " doc_chunk, window_text, window_start, window_end - 1\n", " )\n", " chunks.append(new_chunk)\n", " window_start = window_end\n", " window_text = \"\"\n", " window_text_length = 0\n", " return chunks\n", "\n", " def _split_using_plain_text(\n", " self,\n", " doc_chunk: DocChunk,\n", " ) -> list[DocChunk]:\n", " lengths = self._doc_chunk_length(doc_chunk)\n", " if lengths.total_len <= self.max_tokens:\n", " return [\n", " DocChunk.from_data(\n", " delim=self.delim,\n", " **doc_chunk.export_json_dict(),\n", " )\n", " ]\n", " else:\n", "\n", " # How much room is there for text after subtracting out the headers and captions:\n", " available_length = self.max_tokens - lengths.other_len\n", " sem_chunker = semchunk.chunkerify(\n", " self.tokenizer, chunk_size=available_length\n", " )\n", " if available_length <= 0:\n", " warnings.warn(\n", " f\"Headers and captions for this chunk are longer than the total amount of size for the chunk. Chunk will be ignored.\"\n", " )\n", " return []\n", " text = doc_chunk.text\n", " segments = sem_chunker.chunk(text)\n", " chunks = [\n", " DocChunk.from_data(text=s, meta=doc_chunk.meta, delim=self.delim)\n", " for s in segments\n", " ]\n", " return chunks\n", "\n", " def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):\n", " output_chunks = []\n", " window_start = 0\n", " window_end = 0\n", " l = len(chunks)\n", " while window_end < l:\n", " chunk = chunks[window_end]\n", " lengths = self._doc_chunk_length(chunk)\n", " headings_and_captions = (chunk.meta.headings, chunk.meta.captions)\n", " if window_start == window_end:\n", " # starting a new block of chunks to potentially merge\n", " current_headings_and_captions = headings_and_captions\n", " window_text = chunk.text\n", " window_other_length = lengths.other_len\n", " window_text_length = lengths.text_len\n", " window_items = chunk.meta.doc_items\n", " window_end += 1\n", " first_chunk_of_window = chunk\n", " elif (\n", " headings_and_captions == current_headings_and_captions\n", " and window_text_length + window_other_length + lengths.text_len\n", " <= self.max_tokens\n", " ):\n", " # there is room to include the new chunk so add it to the window and continue\n", " window_text = self._merge_text(window_text, chunk.text)\n", " window_text_length += lengths.text_len\n", " window_items = window_items + chunk.meta.doc_items\n", " window_end += 1\n", " else:\n", " # no more room OR the start of new metadata. Either way, end the block and use the current window_end as the start of a new block\n", " if window_start + 1 == window_end:\n", " # just one chunk so use it as is\n", " output_chunks.append(first_chunk_of_window)\n", " else:\n", " new_meta = DocMeta(\n", " doc_items=window_items,\n", " headings=headings_and_captions[0],\n", " captions=headings_and_captions[1],\n", " )\n", " new_chunk = DocChunk.from_data(\n", " text=window_text,\n", " meta=new_meta,\n", " delim=self.delim,\n", " )\n", " output_chunks.append(new_chunk)\n", " window_start = window_end # no need to reset window_text, etc. because that will be reset in the next iteration in the if window_start == window_end block\n", "\n", " return output_chunks\n", "\n", " def _merge_chunks(self, chunks: list[DocChunk]) -> list[DocChunk]:\n", " res = chunks\n", " # merges as many chunks as possible that have the same headings+captions.\n", " res = self._merge_chunks_with_matching_metadata(res)\n", " # merges chunks with different headings+captions. This is later so that merges within a section or other grouping are preferred.\n", " # res = self._merge_chunks_with_mismatching_metadata(res)\n", " return res\n", "\n", " def _adjust_chunks_for_fixed_size(self, chunks: list[DocChunk]):\n", " res = chunks\n", " res = [x for c in res for x in self._split_by_doc_items(c)]\n", " res = [x for c in res for x in self._split_using_plain_text(c)]\n", " res = self._merge_chunks(res)\n", " return res\n", "\n", " def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:\n", " preliminary_chunks = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n", " output_chunks = self._adjust_chunks_for_fixed_size(preliminary_chunks)\n", " return iter(output_chunks)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Usage" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "conv_res = DocumentConverter().convert(source=DOC_SOURCE)\n", "doc = conv_res.document" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "chunk.text (33 tokens):\n", "'murdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", "chunk.get_text_for_embedding() (39 tokens):\n", "'J. William Murdock\\nmurdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", "chunk.get_text_for_generation() (39 tokens):\n", "'J. William Murdock\\nmurdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", "\n", "chunk.text (58 tokens):\n", "'Abstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", "chunk.get_text_for_embedding() (64 tokens):\n", "'J. William Murdock\\nAbstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", "chunk.get_text_for_generation() (64 tokens):\n", "'J. William Murdock\\nAbstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", "\n", "chunk.text (58 tokens):\n", "'answer to the question. Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", "chunk.get_text_for_embedding() (64 tokens):\n", "'J. William Murdock\\nanswer to the question. Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", "chunk.get_text_for_generation() (64 tokens):\n", "'J. William Murdock\\nanswer to the question. Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", "\n", "chunk.text (38 tokens):\n", "\"to determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", "chunk.get_text_for_embedding() (44 tokens):\n", "\"J. William Murdock\\nto determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", "chunk.get_text_for_generation() (44 tokens):\n", "\"J. William Murdock\\nto determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", "\n", "chunk.text (60 tokens):\n", "'Watson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", "chunk.get_text_for_embedding() (62 tokens):\n", "'1 Introduction\\nWatson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", "chunk.get_text_for_generation() (62 tokens):\n", "'1 Introduction\\nWatson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", "\n" ] } ], "source": [ "chunker = DocChunker(\n", " tokenizer=tokenizer,\n", " max_tokens=MAX_TOKENS, # optional, derived from `tokenizer` if not provided\n", ")\n", "chunks = list(chunker.chunk(dl_doc=doc))\n", "\n", "for chunk in chunks[:5]:\n", " txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n", " print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n", " emb_txt = chunk.get_text_for_embedding()\n", " emb_tokens = len(tokenizer.tokenize(emb_txt, max_length=None))\n", " print(f\"chunk.get_text_for_embedding() ({emb_tokens} tokens):\\n{repr(emb_txt)}\")\n", " gen_txt = chunk.get_text_for_generation()\n", " gen_tokens = len(tokenizer.tokenize(gen_txt, max_length=None))\n", " print(f\"chunk.get_text_for_generation() ({gen_tokens} tokens):\\n{repr(gen_txt)}\")\n", " print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Vector Retrieval" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
vectortextheadingscaptions_distance
0[-0.025746439, 0.03888134, 0.0033668755, -0.03...3. Forbus, K. and Oblinger, D. (1990). Making ...[References]None0.332435
1[0.04400234, -0.034766007, -0.00025527124, 0.0...4. McCord, M. C. (1990). Slot Grammar: A Syste...[References]None1.525625
2[0.10043394, 0.00652478, 0.011601829, -0.06390...passage using semantic and/or syntactic edges:...[3 Syntactic-Semantic Graphs]None1.569923
3[0.025994677, 0.08402823, 0.03268827, -0.03727...In using this algorithm, we have encountered a...[4 Algorithm]None1.576838
4[0.050165094, 0.08015387, 0.035965856, 0.00846...word order) are more aggressive in what they c...[5 Evaluation and Conclusions]None1.580265
\n", "
" ], "text/plain": [ " vector \\\n", "0 [-0.025746439, 0.03888134, 0.0033668755, -0.03... \n", "1 [0.04400234, -0.034766007, -0.00025527124, 0.0... \n", "2 [0.10043394, 0.00652478, 0.011601829, -0.06390... \n", "3 [0.025994677, 0.08402823, 0.03268827, -0.03727... \n", "4 [0.050165094, 0.08015387, 0.035965856, 0.00846... \n", "\n", " text \\\n", "0 3. Forbus, K. and Oblinger, D. (1990). Making ... \n", "1 4. McCord, M. C. (1990). Slot Grammar: A Syste... \n", "2 passage using semantic and/or syntactic edges:... \n", "3 In using this algorithm, we have encountered a... \n", "4 word order) are more aggressive in what they c... \n", "\n", " headings captions _distance \n", "0 [References] None 0.332435 \n", "1 [References] None 1.525625 \n", "2 [3 Syntactic-Semantic Graphs] None 1.569923 \n", "3 [4 Algorithm] None 1.576838 \n", "4 [5 Evaluation and Conclusions] None 1.580265 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def make_lancedb_index(db_uri, index_name, chunks: list[DocChunk], embedding_model):\n", " db = lancedb.connect(db_uri)\n", " data = []\n", " for chunk in chunks:\n", " embeddings = embedding_model.encode(chunk.get_text_for_embedding())\n", " data_item = {\n", " \"vector\": embeddings,\n", " \"text\": chunk.text,\n", " \"headings\": chunk.meta.headings,\n", " \"captions\": chunk.meta.captions,\n", " }\n", " data.append(data_item)\n", " tbl = db.create_table(index_name, data=data, exist_ok=True)\n", " return tbl\n", "\n", "\n", "db_uri = str(Path(mkdtemp()) / \"docling.db\") # or set as needed\n", "index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n", "\n", "sample_query = \"Making SME greedy and pragmatic\"\n", "sample_embedding = embed_model.encode(sample_query)\n", "results = index.search(sample_embedding).limit(5)\n", "\n", "results.to_pandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 2 }