# Hybrid Chunking

In [1]:
# Install the chunking extra of docling-core plus the libraries imported further
# below (transformers, sentence-transformers, lancedb); -q quiets pip, -U upgrades.
# NOTE(review): `docling` itself is imported later but is not installed by this
# line — presumably the environment already provides it; confirm.
%pip install -qU 'docling-core[chunking]' sentence-transformers transformers lancedb

Note: you may need to restart the kernel to use updated packages.


## Conversion

In [2]:
from docling.document_converter import DocumentConverter

# Path of the sample file that gets converted.
DOC_SOURCE = "../../tests/data/md/wiki.md"

# Run the conversion step by step and keep only the `.document` of the result.
converter = DocumentConverter()
conversion_result = converter.convert(source=DOC_SOURCE)
doc = conversion_result.document

## Chunking

Notice how `tokenizer` and `embed_model` further below are single-sourced from `EMBED_MODEL_ID`.

This is important for making sure the chunker and the embedding model are using the same tokenizer.

In [3]:
from transformers import AutoTokenizer

from docling.chunking import HybridChunker

# One model id drives both the tokenizer here and the embedding model later on.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
# Token budget per chunk; optional, by default it is derived from `tokenizer`.
MAX_TOKENS = 64

tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)

# `tokenizer` may also be given as a model name instead of a tokenizer instance;
# `merge_peers` is optional and left at its default (True).
chunker = HybridChunker(tokenizer=tokenizer, max_tokens=MAX_TOKENS)

# Materialize the chunk iterator so the chunks can be traversed more than once.
chunk_iter = chunker.chunk(dl_doc=doc)
chunks = [*chunk_iter]

Points to notice:
- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)
- Where needed, we stop before the limit, e.g. see cases of 63, as it would otherwise run into a comma (see chunk 6)
- Where possible, we merge undersized peer chunks (see chunk 0)
- "Tail" chunks trailing right after merges may still be undersized (see chunk 8)

In [4]:
def _count_tokens(text):
    """Return how many tokens `tokenizer` produces for `text`."""
    return len(tokenizer.tokenize(text, max_length=None))


for idx, chk in enumerate(chunks):
    print(f"=== {idx} ===")

    # Raw chunk text and its token count.
    raw_count = _count_tokens(chk.text)
    print(f"chunk.text ({raw_count} tokens):\n{chk.text!r}")

    # Metadata-enriched serialization of the same chunk and its token count.
    enriched = chunker.serialize(chunk=chk)
    enriched_count = _count_tokens(enriched)
    print(f"chunker.serialize(chunk) ({enriched_count} tokens):\n{enriched!r}")

    print()

=== 0 ===
chunk.text (55 tokens):
'International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.'
chunker.serialize(chunk) (56 tokens):
'IBM\nInternational Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.'

=== 1 ===
chunk.text (45 tokens):
'IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.'
chunker.serialize(chunk) (46 t

## Vector Retrieval

In [5]:
from sentence_transformers import SentenceTransformer

# Load the embedding model from the same EMBED_MODEL_ID that was used for the
# tokenizer, so chunk sizing and embedding are driven by one model id.
embed_model = SentenceTransformer(EMBED_MODEL_ID)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
from pathlib import Path
from tempfile import mkdtemp

import lancedb


def make_lancedb_index(db_uri, index_name, chunks, embedding_model):
    """Embed every chunk and store the resulting rows in a LanceDB table.

    Each row carries the embedding of the chunk's serialized form (obtained
    from the notebook-level `chunker`), the plain chunk text, and the
    headings/captions taken from the chunk metadata.

    Args:
        db_uri: Location passed to `lancedb.connect`.
        index_name: Name of the table to create.
        chunks: Iterable of chunks to index.
        embedding_model: Object whose `encode` method turns text into a vector.

    Returns:
        The table object returned by `create_table`.
    """
    connection = lancedb.connect(db_uri)
    rows = [
        {
            "vector": embedding_model.encode(chunker.serialize(chunk=item)),
            "text": item.text,
            "headings": item.meta.headings,
            "captions": item.meta.captions,
        }
        for item in chunks
    ]
    # NOTE(review): exist_ok=True presumably reuses an already-existing table
    # instead of raising — confirm against the LanceDB documentation.
    return connection.create_table(index_name, data=rows, exist_ok=True)


# Keep the database inside a fresh temporary directory.
tmp_dir = Path(mkdtemp())
db_uri = str(tmp_dir / "docling.db")
index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)

# Embed a sample query and limit the search to five hits.
sample_query = "invent"
sample_embedding = embed_model.encode(sample_query)
search = index.search(sample_embedding)
results = search.limit(5)

# Bare last expression so the notebook renders the hits as a DataFrame.
results.to_pandas()

Unnamed: 0,vector,text,headings,captions,_distance
0,"[-0.1269039, -0.01948185, -0.07718097, -0.1116...","language, and the UPC barcode. The company has...",[IBM],,1.164613
1,"[-0.10198064, 0.0055981805, -0.05095279, -0.13...",IBM originated with several technological inno...,"[IBM, 1910s–1950s]",,1.245144
2,"[-0.057121325, -0.034115084, -0.018113216, -0....",As one of the world's oldest and largest techn...,[IBM],,1.355586
3,"[-0.04429054, -0.058111433, -0.009330196, -0.0...",IBM is the largest industrial research organiz...,[IBM],,1.398617
4,"[-0.11920792, 0.053496413, -0.042391937, -0.03...",Awards.[16],[IBM],,1.446295
