In [1]:
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker, BaseChunk, BaseMeta, BaseChunker
from docling_core.types.doc.document import DocItem
from docling_core.types import DoclingDocument

import semchunk

from pydantic import Field, PositiveInt
from typing import Optional, Iterator

from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Convert the PDF referenced by URL and keep the resulting document object.
conv_res = DocumentConverter().convert("http://bill.murdocks.org/iccbr2011murdock_web.pdf")
doc = conv_res.document
# Materialize the HierarchicalChunker output as a list so individual chunks can be indexed in later cells.
chunks = list(HierarchicalChunker().chunk(doc))

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 44567.57it/s]


In [3]:
# Find the chunk containing the big bulleted list that starts with "Local";
# that chunk is useful for testing how lists are handled.
# enumerate supplies the index, so no hand-maintained counter is needed.
for i, c in enumerate(chunks):
    if "Local" in c.meta.doc_items[0].text:
        print(i)

19


In [4]:
# Text of chunk 19, the index printed by the search loop above.
chunks[19].text

'\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\n\u

In [5]:
# The doc items recorded in the metadata of chunk 19.
chunks[19].meta.doc_items

[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=<DocItemLabel.LIST_ITEM: 'list_item'>, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 363))], orig='\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet

In [6]:
# Document name; adjust_chunks_for_fixed_size counts its tokens as the title length.
doc.name

'iccbr2011murdock_web'

In [7]:
# Model id handed to AutoTokenizer.from_pretrained; its tokenizer is used for all token counts.
EMBED_MODEL_ID = 'sentence-transformers/all-MiniLM-L6-v2'

In [8]:
# Tokenizer for EMBED_MODEL_ID; passed to count_tokens and make_splitter in the cells below.
TOKENIZER = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)

In [9]:
# Sanity check: show how the tokenizer splits a short two-line string.
TOKENIZER.tokenize('I like Ike.\nBob likes Joe.')

['i', 'like', 'ike', '.', 'bob', 'likes', 'joe', '.']

In [10]:
# Number of tokens produced for the same string.
len(TOKENIZER.tokenize('I like Ike.\nBob likes Joe.'))

8

In [11]:
def count_tokens(text, tokenizer):
    """Count the tokens that `tokenizer` produces for `text`.

    Args:
        text: A string, a (possibly nested) list or tuple of strings, or None.
        tokenizer: Object exposing ``tokenize(text, max_length=None)`` that
            returns a sized sequence of tokens.

    Returns:
        0 when `text` is None, the summed count over all elements when `text`
        is a list or tuple, otherwise ``len(tokenizer.tokenize(text))``.
    """
    # Identity test: None is a singleton, and `==` can be overridden by the operand.
    if text is None:
        return 0
    if isinstance(text, (list, tuple)):
        return sum(count_tokens(part, tokenizer) for part in text)
    # max_length=None is forwarded unchanged; presumably it keeps the tokenizer
    # from applying a length limit -- TODO confirm against the tokenizer docs.
    return len(tokenizer.tokenize(text, max_length=None))

In [12]:
# count_tokens also accepts a list of strings and sums the per-string counts.
count_tokens(['I like Ike.\nBob likes Joe.'], TOKENIZER)

8

In [13]:
def make_splitter(tokenizer, chunk_size):
    """Build a semchunk chunker bound to `tokenizer` and `chunk_size`.

    Args:
        tokenizer: Tokenizer handed to ``semchunk.chunkerify``.
        chunk_size: Chunk size handed to ``semchunk.chunkerify``.

    Returns:
        The chunker object created by ``semchunk.chunkerify``.
    """
    splitter = semchunk.chunkerify(tokenizer, chunk_size)
    return splitter

In [14]:
# Build a splitter with a tiny chunk_size of 2 so the splitting behavior is easy to see.
s = make_splitter(TOKENIZER, 2)
s.chunk('I like Ike.\nBob likes Joe.')

['I like', 'Ike.', 'Bob likes', 'Joe.']

In [15]:
def doc_chunk_length(doc_chunk, title_length, tokenizer):
    """Break down how many tokens a chunk occupies.

    Args:
        doc_chunk: Chunk exposing ``text``, ``meta.headings`` and ``meta.captions``.
        title_length: Token count of the document title, supplied by the caller.
        tokenizer: Tokenizer forwarded to ``count_tokens``.

    Returns:
        dict with ``'total'`` (title + text + headings + captions),
        ``'text'`` (text only) and ``'other'`` (total minus text, i.e. the
        title, headings and captions together).
    """
    text_tokens = count_tokens(doc_chunk.text, tokenizer)
    meta = doc_chunk.meta
    # count_tokens returns 0 for None and sums over lists, so the optional
    # metadata fields need no special-casing here.
    metadata_tokens = sum(
        count_tokens(field, tokenizer) for field in (meta.headings, meta.captions)
    )
    overall = title_length + text_tokens + metadata_tokens
    return dict(total=overall, text=text_tokens, other=overall - text_tokens)

In [16]:
# Token breakdown for chunk 19, using a title length of 1.
doc_chunk_length(chunks[19], 1, TOKENIZER)

{'total': 307, 'text': 304, 'other': 3}

In [17]:
# Simplified version of DocMeta from the Hierarchical Chunker.  We can't just use that structure because the attributes are private_attributes as tracked by pydantic.

class DocumentMeta(BaseMeta):
    """Chunk metadata: the source doc items plus optional headings and captions."""

    # At least one doc item is required.
    doc_items: list[DocItem] = Field(min_length=1)
    # Optional; when provided, the list must contain at least one entry.
    headings: Optional[list[str]] = Field(default=None, min_length=1)
    # Optional; when provided, the list must contain at least one entry.
    captions: Optional[list[str]] = Field(default=None, min_length=1)


class DocumentChunk(BaseChunk):
    """Chunk model used for the chunks this notebook creates when splitting and merging."""

    # Declared as BaseMeta rather than DocumentMeta: split_using_plain_text passes
    # the source chunk's own meta object through unchanged.
    meta: BaseMeta

In [18]:
def make_chunk_from_doc_items(doc_chunk, window_text, window_start, window_end):
    """Create a DocumentChunk covering a contiguous window of `doc_chunk`'s doc items.

    Args:
        doc_chunk: Source chunk whose doc items, headings and captions are reused.
        window_text: Text for the new chunk.
        window_start: Index of the first doc item in the window.
        window_end: Index of the last doc item in the window (inclusive).

    Returns:
        A DocumentChunk whose DocumentMeta holds the selected doc items and the
        source chunk's headings and captions.
    """
    source_meta = doc_chunk.meta
    return DocumentChunk(
        text=window_text,
        meta=DocumentMeta(
            # window_end is inclusive, hence the +1 on the slice bound.
            doc_items=source_meta.doc_items[window_start:window_end + 1],
            headings=source_meta.headings,
            captions=source_meta.captions,
        ),
    )


def merge_text(t1, t2):
    """Join two text fragments with a newline, skipping the separator if either is empty.

    Args:
        t1: First fragment.
        t2: Second fragment.

    Returns:
        `t2` when `t1` is the empty string, `t1` when `t2` is the empty string,
        otherwise the two fragments separated by a single newline.
    """
    if t1 == "":
        return t2
    if t2 == "":
        return t1
    return "\n".join((t1, t2))


def split_by_doc_items(doc_chunk, title_length, tokenizer, chunk_size):
    """Split an oversized chunk along the boundaries of its doc items.

    Consecutive doc items are packed greedily into windows. A window is closed
    as soon as adding the next item would push its text tokens plus the chunk's
    fixed overhead (title, headings, captions) above `chunk_size`.

    Args:
        doc_chunk: Chunk exposing ``text`` and ``meta`` (doc_items, headings, captions).
        title_length: Token count of the document title.
        tokenizer: Tokenizer forwarded to ``count_tokens``.
        chunk_size: Maximum number of tokens allowed per chunk.

    Returns:
        ``[doc_chunk]`` unchanged when the chunk has at most one doc item or
        already fits within `chunk_size`; otherwise a list of chunks built by
        ``make_chunk_from_doc_items``, one per window, in document order. A
        single doc item that is too large on its own is emitted as its own
        chunk and is not split further here.
    """
    doc_items = doc_chunk.meta.doc_items
    if doc_items is None or len(doc_items) <= 1:
        return [doc_chunk]
    length = doc_chunk_length(doc_chunk, title_length, tokenizer)
    if length['total'] <= chunk_size:
        return [doc_chunk]

    other_length = length['other']
    chunks = []
    window_start = 0
    window_text = ""
    window_text_length = 0
    for index, doc_item in enumerate(doc_items):
        text = doc_item.text
        text_length = count_tokens(text, tokenizer)
        window_has_items = index > window_start
        # A window that exactly fills chunk_size is accepted, matching the `<=`
        # used by the whole-chunk check above.
        overflows = window_text_length + text_length + other_length > chunk_size
        if window_has_items and overflows:
            # Close the window before this item and start a new one with it.
            chunks.append(make_chunk_from_doc_items(doc_chunk, window_text, window_start, index - 1))
            window_start = index
            window_text = ""
            window_text_length = 0
        # An item that is too large on its own still opens a window; the next
        # item (or the final flush below) then emits it alone.
        window_text = merge_text(window_text, text)
        window_text_length += text_length
    # The last item is always in the open window, so one final flush is needed.
    chunks.append(make_chunk_from_doc_items(doc_chunk, window_text, window_start, len(doc_items) - 1))
    return chunks

In [19]:
# Split chunk 19 by doc items, using title_length=5 and chunk_size=300.
split_chunks = split_by_doc_items(chunks[19], 5, TOKENIZER, 300)
split_chunks

[DocumentChunk(text='\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the ca

In [20]:
# Print each text with its token count: first the doc items of chunk 19,
# then the chunks produced by split_by_doc_items.
sections = (
    ('Item lengths', chunks[19].meta.doc_items),
    ('Chunk lengths', split_chunks),
)
for heading, entries in sections:
    print(heading)
    for entry in entries:
        n_tokens = count_tokens(entry.text, TOKENIZER)
        print(entry.text)
        print(n_tokens)

Item lengths
 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.
84
 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.
85
 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).
33

In [21]:
def split_using_plain_text(doc_chunk, title_length, tokenizer, plain_text_splitter, chunk_size):
    """Split a chunk that is still too large by splitting its plain text.

    Args:
        doc_chunk: Chunk exposing ``text`` and ``meta``.
        title_length: Token count of the document title.
        tokenizer: Tokenizer forwarded to ``doc_chunk_length``.
        plain_text_splitter: Object whose ``chunk(text)`` returns text segments.
        chunk_size: Maximum number of tokens allowed per chunk.

    Returns:
        ``[doc_chunk]`` unchanged when it already fits; otherwise one
        DocumentChunk per segment, each reusing the source chunk's ``meta``.

    Raises:
        ValueError: If the title, headings and captions alone leave no room for text.
    """
    lengths = doc_chunk_length(doc_chunk, title_length, tokenizer)
    if lengths['total'] <= chunk_size:
        return [doc_chunk]
    # lengths['other'] already includes title_length (doc_chunk_length computes it
    # as total minus text), so the title must not be subtracted a second time.
    available_length = chunk_size - lengths['other']
    if available_length <= 0:
        raise ValueError("Title, headers, and captions for this chunk are longer than the total amount of size for the chunk.  This is not supported now.")
    # NOTE(review): available_length only guards the error above. The splitter
    # is used as passed in, so its own size setting bounds the segments.
    segments = plain_text_splitter.chunk(doc_chunk.text)
    return [DocumentChunk(text=segment, meta=doc_chunk.meta) for segment in segments]

In [22]:
# Normally the same chunk_size would be used for this step too; for testing, the first output of the previous step is split into even smaller chunks.

chunk_size = 50
plain_text_splitter = make_splitter(TOKENIZER, chunk_size)
# title_length is passed as 5, the same value used in the split_by_doc_items call above.
resplit_chunks = split_using_plain_text(split_chunks[0], 5, TOKENIZER, plain_text_splitter, chunk_size)
resplit_chunks 

[DocumentChunk(text='\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=<DocItemLabel.LIST_ITEM: 'list_item'>, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 363))], orig='\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\uf0b7

In [23]:
# Show every re-split chunk followed by its token count.
for piece in resplit_chunks:
    n_tokens = count_tokens(piece.text, TOKENIZER)
    print(piece.text, n_tokens, sep="\n")

 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of
50
resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.
34
 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local
50
matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.
35
 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).
33


In [24]:
def merge_chunks_with_matching_metadata(chunks, title_length, tokenizer, chunk_size):
    """Merge runs of consecutive chunks that share the same headings and captions.

    Chunks are added to the current window while their (headings, captions)
    pair equals the window's and the window's text tokens plus its overhead
    stay within `chunk_size`. Every window, including the last one, is emitted.

    Args:
        chunks: Iterable of chunks exposing ``text`` and ``meta``
            (doc_items, headings, captions).
        title_length: Token count of the document title.
        tokenizer: Tokenizer forwarded to ``doc_chunk_length``.
        chunk_size: Maximum number of tokens allowed per merged chunk.

    Returns:
        List of chunks in input order. A window holding a single chunk yields
        that chunk object unchanged; a larger window yields a new DocumentChunk.
    """
    output_chunks = []
    window_chunks = []      # chunks collected for the merge in progress
    window_metadata = None  # (headings, captions) shared by the window
    window_text = ""
    window_text_length = 0
    window_other_length = 0
    window_items = []
    for chunk in chunks:
        lengths = doc_chunk_length(chunk, title_length, tokenizer)
        chunk_metadata = (chunk.meta.headings, chunk.meta.captions)
        extends_window = (
            bool(window_chunks)
            and chunk_metadata == window_metadata
            and window_text_length + window_other_length + lengths['text'] <= chunk_size
        )
        if extends_window:
            window_chunks.append(chunk)
            window_text = merge_text(window_text, chunk.text)
            window_text_length += lengths['text']
            # `+` builds a new list, so no chunk's own doc_items list is mutated.
            window_items = window_items + chunk.meta.doc_items
            continue
        # No room left, or the metadata changed: close the window and start a
        # new one with the current chunk.
        if window_chunks:
            output_chunks.append(
                _finish_merge_window(window_chunks, window_text, window_items, window_metadata)
            )
        window_chunks = [chunk]
        window_metadata = chunk_metadata
        window_text = chunk.text
        window_text_length = lengths['text']
        window_other_length = lengths['other']
        window_items = chunk.meta.doc_items
    # Flush the window still open when the input ends; without this the
    # trailing chunk(s) would be dropped.
    if window_chunks:
        output_chunks.append(
            _finish_merge_window(window_chunks, window_text, window_items, window_metadata)
        )
    return output_chunks


def _finish_merge_window(window_chunks, window_text, window_items, window_metadata):
    """Return the lone chunk of a one-chunk window, or a new merged chunk otherwise."""
    if len(window_chunks) == 1:
        return window_chunks[0]
    # Use the window's own metadata, not that of the chunk that closed the window.
    headings, captions = window_metadata
    merged_meta = DocumentMeta(doc_items=window_items, headings=headings, captions=captions)
    return DocumentChunk(text=window_text, meta=merged_meta)


def merge_chunks_with_mismatching_metadata(chunks, *_):
    """Placeholder pass that returns `chunks` unchanged.

    Merging across text with different headings and captions is not done for
    now. In principle it seems like a good idea where entire sections can be
    merged, but it is not clear what to do about the metadata then, presumably
    because some of it applies to only part of the merged text.

    Args:
        chunks: Chunks to pass through.
        *_: Ignored; accepted so the call signature matches the matching-metadata pass.

    Returns:
        The same `chunks` object that was passed in.
    """
    unmerged = chunks
    return unmerged


def merge_chunks(chunks, title_length, tokenizer, chunk_size):
    """Run both merge passes over `chunks` and return the result.

    The matching-metadata pass runs first so that merges within a section or
    other grouping are preferred; the mismatching-metadata pass runs on its output.

    Args:
        chunks: Chunks to merge.
        title_length: Token count of the document title.
        tokenizer: Tokenizer forwarded to both passes.
        chunk_size: Maximum number of tokens allowed per chunk.

    Returns:
        The chunks returned by the second pass.
    """
    shared_args = (title_length, tokenizer, chunk_size)
    same_metadata_pass = merge_chunks_with_matching_metadata(chunks, *shared_args)
    return merge_chunks_with_mismatching_metadata(same_metadata_pass, *shared_args)

In [25]:
def adjust_chunks_for_fixed_size(doc, original_chunks, tokenizer, splitter, chunk_size):
    """Split and then merge chunks in three stages.

    Stages, in order: split by doc items, split the remaining oversized chunks
    by plain text, then merge chunks again.

    Args:
        doc: Document whose ``name`` is token-counted as the title length.
        original_chunks: Iterable of chunks to adjust.
        tokenizer: Tokenizer used for all token counts.
        splitter: Plain-text splitter forwarded to ``split_using_plain_text``.
        chunk_size: Maximum number of tokens allowed per chunk.

    Returns:
        The list of chunks returned by ``merge_chunks``.
    """
    title_length = count_tokens(doc.name, tokenizer)
    item_level_chunks = [
        piece
        for chunk in original_chunks
        for piece in split_by_doc_items(chunk, title_length, tokenizer, chunk_size)
    ]
    text_level_chunks = [
        piece
        for chunk in item_level_chunks
        for piece in split_using_plain_text(chunk, title_length, tokenizer, splitter, chunk_size)
    ]
    return merge_chunks(text_level_chunks, title_length, tokenizer, chunk_size)

In [26]:
# Run the full split-and-merge pipeline on six consecutive chunks, starting at chunk 19.
chunk_size = 256
test_chunks = chunks[19:25]
# The splitter is built with the same chunk_size that is passed to the pipeline.
adjusted = adjust_chunks_for_fixed_size(doc, test_chunks, TOKENIZER, make_splitter(TOKENIZER, chunk_size), chunk_size)
print(adjusted)

[DocumentChunk(text='\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the ca

In [27]:
# Print each chunk's text with its token count, before and after adjustment.
sections = (
    ('Original chunks', test_chunks),
    ('Adjusted chunks', adjusted),
)
for heading, entries in sections:
    print(heading)
    for entry in entries:
        n_tokens = count_tokens(entry.text, TOKENIZER)
        print(entry.text)
        print(n_tokens)

Original chunks
 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.
 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.
 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).
 Mat

In [28]:
class MaxTokenLimitingChunkerWithMerging(BaseChunker):
    """Chunker that post-processes an inner chunker's output to respect a token limit.

    The inner chunker's chunks are passed through ``adjust_chunks_for_fixed_size``,
    which splits and then merges them using `max_tokens` as the chunk size.
    """

    # Chunker that produces the preliminary chunks.
    inner_chunker: BaseChunker = HierarchicalChunker()
    # Chunk size, in tokens, passed to the splitter and to the adjustment step.
    max_tokens: PositiveInt = 512
    # Model id passed to AutoTokenizer.from_pretrained.
    embedding_model_id: str

    def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:
        """Chunk `dl_doc` and adjust the chunks to the configured token limit.

        Args:
            dl_doc: Document to chunk.
            **kwargs: Forwarded to the inner chunker's ``chunk``.

        Returns:
            Iterator over the adjusted chunks.
        """
        preliminary_chunks = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)
        tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_id)
        splitter = make_splitter(tokenizer, self.max_tokens)
        # Pass the document given to this call, not a notebook-level global, so
        # the title length comes from the document actually being chunked.
        output_chunks = adjust_chunks_for_fixed_size(dl_doc, preliminary_chunks, tokenizer, splitter, self.max_tokens)
        return iter(output_chunks)

In [29]:
chunker = MaxTokenLimitingChunkerWithMerging(max_tokens=64, embedding_model_id=EMBED_MODEL_ID)
final_output_chunks = chunker.chunk(dl_doc=doc)

# Preview the first chunks with their token counts. enumerate(start=1) replaces
# the hand-maintained counter; the loop still stops after the 11th chunk.
for i, chunk in enumerate(final_output_chunks, start=1):
    print(chunk.text)
    print(count_tokens(chunk.text, TOKENIZER))
    if i > 10:
        break

murdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598
33
Abstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct answer to the question. Recognizing
64
whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied to determine similarity between content in questions and passages. That algorithm
64
is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.
26
Watson is a