From 7ed4d371c552ac7a5aad3345c947467a4086ea6b Mon Sep 17 00:00:00 2001 From: Bill Murdock Date: Fri, 1 Nov 2024 13:22:54 -0400 Subject: [PATCH] Update advanced_chunking_with_merging Signed-off-by: Bill Murdock --- .../advanced_chunking_with_merging.ipynb | 370 +++++++++++------- 1 file changed, 238 insertions(+), 132 deletions(-) diff --git a/docs/examples/advanced_chunking_with_merging.ipynb b/docs/examples/advanced_chunking_with_merging.ipynb index 0461a5e0..b4ee9ddb 100644 --- a/docs/examples/advanced_chunking_with_merging.ipynb +++ b/docs/examples/advanced_chunking_with_merging.ipynb @@ -2,30 +2,24 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 32, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/bmurdock/.pyenv/versions/bmurdock-pyenv-virtualenv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ - "from docling.document_converter import DocumentConverter\n", - "from docling_core.transforms.chunker import HierarchicalChunker, BaseChunk, BaseMeta, BaseChunker\n", - "from docling_core.types.doc.document import DocItem\n", - "from docling_core.types import DoclingDocument\n", - "\n", + "from typing import Iterator\n", "import semchunk\n", + "from docling_core.transforms.chunker import (\n", + " BaseChunk,\n", + " BaseChunker,\n", + " HierarchicalChunker\n", + ")\n", + "from docling_core.types import DoclingDocument\n", + "from pydantic import PositiveInt\n", + "from transformers import AutoTokenizer\n", + "from sentence_transformers import SentenceTransformer\n", "\n", - "from pydantic import Field, PositiveInt\n", - "from typing import Optional, Iterator\n", - "\n", - "from transformers import AutoTokenizer" + "from docling.document_converter 
import DocumentConverter\n", + "import lancedb" ] }, { @@ -37,12 +31,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 44567.57it/s]\n" + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 83514.90it/s]\n" ] } ], "source": [ - "conv_res = DocumentConverter().convert(\"http://bill.murdocks.org/iccbr2011murdock_web.pdf\")\n", + "conv_res = DocumentConverter().convert(\n", + " \"http://bill.murdocks.org/iccbr2011murdock_web.pdf\"\n", + ")\n", "doc = conv_res.document\n", "chunks = list(HierarchicalChunker().chunk(doc))" ] @@ -138,7 +134,7 @@ "metadata": {}, "outputs": [], "source": [ - "EMBED_MODEL_ID = 'sentence-transformers/all-MiniLM-L6-v2'" + "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"" ] }, { @@ -167,7 +163,7 @@ } ], "source": [ - "TOKENIZER.tokenize('I like Ike.\\nBob likes Joe.')" + "TOKENIZER.tokenize(\"I like Ike.\\nBob likes Joe.\")" ] }, { @@ -187,7 +183,7 @@ } ], "source": [ - "len(TOKENIZER.tokenize('I like Ike.\\nBob likes Joe.'))" + "len(TOKENIZER.tokenize(\"I like Ike.\\nBob likes Joe.\"))" ] }, { @@ -196,8 +192,8 @@ "metadata": {}, "outputs": [], "source": [ - "def count_tokens(text, tokenizer):\n", - " if text == None:\n", + "def count_tokens(text: list[str] | None, tokenizer):\n", + " if text is None:\n", " return 0\n", " elif isinstance(text, list):\n", " total = 0\n", @@ -224,7 +220,7 @@ } ], "source": [ - "count_tokens(['I like Ike.\\nBob likes Joe.'], TOKENIZER)" + "count_tokens([\"I like Ike.\\nBob likes Joe.\"], TOKENIZER)" ] }, { @@ -255,7 +251,7 @@ ], "source": [ "s = make_splitter(TOKENIZER, 2)\n", - "s.chunk('I like Ike.\\nBob likes Joe.')" + "s.chunk(\"I like Ike.\\nBob likes Joe.\")" ] }, { @@ -264,17 +260,16 @@ "metadata": {}, "outputs": [], "source": [ - "def doc_chunk_length(doc_chunk, title_length, tokenizer):\n", + "from docling_core.transforms.chunker.hierarchical_chunker import DocChunk\n", + "\n", + "\n", + "def doc_chunk_length(doc_chunk: 
DocChunk, title_length: int, tokenizer):\n", " text_length = count_tokens(doc_chunk.text, tokenizer)\n", " # Note that count_tokens handles None and lists, making this code simpler:\n", " headings_length = count_tokens(doc_chunk.meta.headings, tokenizer)\n", " captions_length = count_tokens(doc_chunk.meta.captions, tokenizer)\n", " total = title_length + text_length + headings_length + captions_length\n", - " return {\n", - " 'total': total,\n", - " 'text': text_length,\n", - " 'other': total - text_length\n", - " } " + " return {\"total\": total, \"text\": text_length, \"other\": total - text_length}" ] }, { @@ -303,41 +298,19 @@ "metadata": {}, "outputs": [], "source": [ - "# Simplified version of DocMeta from the Hierarchical Chunker. We can't just use that structure because the attributes are private_attributes as tracked by pydantic.\n", + "from docling_core.transforms.chunker import DocMeta, HierarchicalChunker\n", + "from docling_core.transforms.chunker.hierarchical_chunker import DocChunk\n", "\n", - "class DocumentMeta(BaseMeta):\n", - " \"\"\"Data model for chunk metadata.\"\"\"\n", "\n", - " doc_items: list[DocItem] = Field(\n", - " min_length=1\n", + "def make_chunk_from_doc_items(\n", + " doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int\n", + "):\n", + " meta = DocMeta(\n", + " doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],\n", + " headings=doc_chunk.meta.headings,\n", + " captions=doc_chunk.meta.captions,\n", " )\n", - " headings: Optional[list[str]] = Field(\n", - " default=None,\n", - " min_length=1\n", - " )\n", - " captions: Optional[list[str]] = Field(\n", - " default=None,\n", - " min_length=1\n", - " )\n", - "\n", - "\n", - "class DocumentChunk(BaseChunk):\n", - " \"\"\"Data model for chunks.\"\"\"\n", - "\n", - " meta: BaseMeta" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "def make_chunk_from_doc_items(doc_chunk, window_text, 
window_start, window_end):\n", - " meta=DocumentMeta(doc_items=doc_chunk.meta.doc_items[window_start:window_end+1],\n", - " headings=doc_chunk.meta.headings,\n", - " captions=doc_chunk.meta.captions)\n", - " new_chunk = DocumentChunk(text=window_text, meta=meta)\n", + " new_chunk = DocChunk(text=window_text, meta=meta)\n", " return new_chunk\n", "\n", "\n", @@ -350,11 +323,13 @@ " return t1 + \"\\n\" + t2\n", "\n", "\n", - "def split_by_doc_items(doc_chunk, title_length, tokenizer, chunk_size):\n", + "def split_by_doc_items(\n", + " doc_chunk: DocChunk, title_length: int, tokenizer, chunk_size: int\n", + "):\n", " if doc_chunk.meta.doc_items == None or len(doc_chunk.meta.doc_items) <= 1:\n", " return [doc_chunk]\n", " length = doc_chunk_length(doc_chunk, title_length, tokenizer)\n", - " if length['total'] <= chunk_size:\n", + " if length[\"total\"] <= chunk_size:\n", " return [doc_chunk]\n", " else:\n", " chunks = []\n", @@ -362,13 +337,16 @@ " window_end = 0\n", " window_text = \"\"\n", " window_text_length = 0\n", - " other_length = length['other']\n", + " other_length = length[\"other\"]\n", " l = len(doc_chunk.meta.doc_items)\n", " while window_end < l:\n", " doc_item = doc_chunk.meta.doc_items[window_end]\n", " text = doc_item.text\n", " text_length = count_tokens(text, tokenizer)\n", - " if text_length + window_text_length + other_length < chunk_size and window_end < l - 1:\n", + " if (\n", + " text_length + window_text_length + other_length < chunk_size\n", + " and window_end < l - 1\n", + " ):\n", " # Still room left to add more to this chunk AND still at least one item left\n", " window_end += 1\n", " window_text_length += text_length\n", @@ -376,42 +354,48 @@ " elif text_length + window_text_length + other_length < chunk_size:\n", " # All the items in the window fit into the chunk and there are no other items left\n", " window_text = merge_text(window_text, text)\n", - " new_chunk = make_chunk_from_doc_items(doc_chunk, window_text, window_start, 
window_end)\n", + " new_chunk = make_chunk_from_doc_items(\n", + " doc_chunk, window_text, window_start, window_end\n", + " )\n", " chunks.append(new_chunk)\n", " window_end = l\n", " elif window_start == window_end:\n", " # Only one item in the window and it doesn't fit into the chunk. So we'll just make it a chunk for now and it will get split in the plain text splitter.\n", " window_text = merge_text(window_text, text)\n", - " new_chunk = make_chunk_from_doc_items(doc_chunk, window_text, window_start, window_end)\n", + " new_chunk = make_chunk_from_doc_items(\n", + " doc_chunk, window_text, window_start, window_end\n", + " )\n", " chunks.append(new_chunk)\n", - " window_start = window_end+1\n", + " window_start = window_end + 1\n", " window_end = window_start\n", - " window_text = ''\n", + " window_text = \"\"\n", " window_text_length = 0\n", " else:\n", " # Multiple items in the window but they don't fit into the chunk. However, the existing items must have fit or we wouldn't have gotten here.\n", " # So we put everything but the last item into the chunk and then start a new window INCLUDING the current window end.\n", - " new_chunk = make_chunk_from_doc_items(doc_chunk, window_text, window_start, window_end-1)\n", + " new_chunk = make_chunk_from_doc_items(\n", + " doc_chunk, window_text, window_start, window_end - 1\n", + " )\n", " chunks.append(new_chunk)\n", " window_start = window_end\n", - " window_text = ''\n", + " window_text = \"\"\n", " window_text_length = 0\n", " return chunks" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[DocumentChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", - " DocumentChunk(text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/28', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.39203643798828, t=385.059814453125, r=473.0816345214844, b=312.2467041015625, coord_origin=), charspan=(0, 467))], orig='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. 
Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None))]" + "[DocChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. 
Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None)),\n", + " DocChunk(text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. 
Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/28', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.39203643798828, t=385.059814453125, r=473.0816345214844, b=312.2467041015625, coord_origin=), charspan=(0, 467))], orig='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. 
Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None))]" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -423,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -450,14 +434,14 @@ } ], "source": [ - "print('Item lengths')\n", + "print(\"Item lengths\")\n", "\n", "for item in chunks[19].meta.doc_items:\n", " count = count_tokens(item.text, TOKENIZER)\n", " print(item.text)\n", " print(count)\n", "\n", - "print('Chunk lengths')\n", + "print(\"Chunk lengths\")\n", "\n", "for c in split_chunks:\n", " count = count_tokens(c.text, TOKENIZER)\n", @@ -467,44 +451,52 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ - "def split_using_plain_text(doc_chunk, title_length, tokenizer, plain_text_splitter, chunk_size):\n", + "def split_using_plain_text(\n", + " doc_chunk: DocChunk,\n", + " title_length: int,\n", + " tokenizer,\n", + " plain_text_splitter,\n", + " chunk_size: int,\n", + "):\n", " lengths = doc_chunk_length(doc_chunk, title_length, tokenizer)\n", - " if lengths['total'] <= chunk_size:\n", + " if lengths[\"total\"] <= chunk_size:\n", " return [doc_chunk]\n", " else:\n", " # How much room is there for text after subtracting out the title, headers, and captions:\n", - " available_length = chunk_size - title_length - lengths['other']\n", + " available_length = chunk_size - title_length - lengths[\"other\"]\n", " if available_length <= 0:\n", - " raise ValueError(\"Title, headers, and captions for this chunk are longer than the total amount of size for the chunk. 
This is not supported now.\")\n", + " raise ValueError(\n", + " \"Title, headers, and captions for this chunk are longer than the total amount of size for the chunk. This is not supported now.\"\n", + " )\n", " text = doc_chunk.text\n", " segments = plain_text_splitter.chunk(text)\n", " chunks = []\n", " for s in segments:\n", - " new_chunk = DocumentChunk(text=s, meta=doc_chunk.meta)\n", - " chunks.append(new_chunk)\n", + " new_chunk = DocChunk(text=s, meta=doc_chunk.meta)\n", + " chunks.append(new_chunk)\n", " return chunks" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[DocumentChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", - " DocumentChunk(text='resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", - " DocumentChunk(text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. 
Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", - " DocumentChunk(text='matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", - " DocumentChunk(text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None))]" + "[DocChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. 
Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None)),\n", + " DocChunk(text='resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. 
Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None)),\n", + " DocChunk(text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None)),\n", + " DocChunk(text='matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. 
Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None)),\n", + " DocChunk(text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. 
Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None))]" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -512,15 +504,18 @@ "source": [ "# Normally we'd have the same chunk_size for this step too, but for testing I am taking the first output from the previous step and splitting it into even smaller chunks.\n", "\n", + "\n", "chunk_size = 50\n", "plain_text_splitter = make_splitter(TOKENIZER, chunk_size)\n", - "resplit_chunks = split_using_plain_text(split_chunks[0], 5, TOKENIZER, plain_text_splitter, chunk_size)\n", - "resplit_chunks " + "resplit_chunks = split_using_plain_text(\n", + " split_chunks[0], 5, TOKENIZER, plain_text_splitter, chunk_size\n", + ")\n", + "resplit_chunks" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -549,7 +544,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -566,27 +561,34 @@ " # starting a new block of chunks to potentially merge\n", " current_headings_and_captions = headings_and_captions\n", " 
window_text = chunk.text\n", - " window_other_length = lengths['other']\n", - " window_text_length = lengths['text']\n", + " window_other_length = lengths[\"other\"]\n", + " window_text_length = lengths[\"text\"]\n", " window_items = chunk.meta.doc_items\n", " window_end += 1\n", " first_chunk_of_window = chunk\n", - " elif headings_and_captions == current_headings_and_captions and window_text_length + window_other_length + lengths['text'] <= chunk_size:\n", - " # there is room to include the new chunk so add it to the window and continue\n", - " window_text = merge_text(window_text, chunk.text)\n", - " window_text_length += lengths['text']\n", - " window_items = window_items + chunk.meta.doc_items\n", - " window_end += 1\n", + " elif (\n", + " headings_and_captions == current_headings_and_captions\n", + " and window_text_length + window_other_length + lengths[\"text\"] <= chunk_size\n", + " ):\n", + " # there is room to include the new chunk so add it to the window and continue\n", + " window_text = merge_text(window_text, chunk.text)\n", + " window_text_length += lengths[\"text\"]\n", + " window_items = window_items + chunk.meta.doc_items\n", + " window_end += 1\n", " else:\n", " # no more room OR the start of new metadata. Either way, end the block and use the current window_end as the start of a new block\n", " if window_start + 1 == window_end:\n", " # just one chunk so use it as is\n", " output_chunks.append(first_chunk_of_window)\n", " else:\n", - " new_meta = DocumentMeta(doc_items=window_items, headings=headings_and_captions[0], captions=headings_and_captions[1])\n", - " new_chunk = DocumentChunk(text=window_text, meta=new_meta)\n", + " new_meta = DocMeta(\n", + " doc_items=window_items,\n", + " headings=headings_and_captions[0],\n", + " captions=headings_and_captions[1],\n", + " )\n", + " new_chunk = DocChunk(text=window_text, meta=new_meta)\n", " output_chunks.append(new_chunk)\n", - " window_start = window_end # no need to reset window_text, etc. 
because that will be reset in the next iteration in the if window_start == window_end block\n", + " window_start = window_end # no need to reset window_text, etc. because that will be reset in the next iteration in the if window_start == window_end block\n", "\n", " return output_chunks\n", "\n", @@ -594,21 +596,25 @@ "def merge_chunks_with_mismatching_metadata(chunks, *_):\n", " # placeholder, for now we're not merging across text with different headings+captions\n", " # in principal it seems like a good idea for cases where you can merge entire sections\n", - " # but it is not clear what you do about the metadata then because some of it apples to \n", + " # but it is not clear what you do about the metadata then because some of it apples to\n", " return chunks\n", "\n", "\n", "def merge_chunks(chunks, title_length, tokenizer, chunk_size):\n", " # merges as many chunks as possible that have the same headings+captions.\n", - " initial_merged_chunks = merge_chunks_with_matching_metadata(chunks, title_length, tokenizer, chunk_size)\n", + " initial_merged_chunks = merge_chunks_with_matching_metadata(\n", + " chunks, title_length, tokenizer, chunk_size\n", + " )\n", " # merges chunks with different headings+captions. 
This is later so that merges within a section or other grouping are preferred.\n", - " final_merged_chunks = merge_chunks_with_mismatching_metadata(initial_merged_chunks, title_length, tokenizer, chunk_size)\n", + " final_merged_chunks = merge_chunks_with_mismatching_metadata(\n", + " initial_merged_chunks, title_length, tokenizer, chunk_size\n", + " )\n", " return final_merged_chunks" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -617,39 +623,47 @@ " title_length = count_tokens(title, tokenizer)\n", " chunks_after_splitting_by_items = []\n", " for chunk in original_chunks:\n", - " chunk_split_by_doc_items = split_by_doc_items(chunk, title_length, tokenizer, chunk_size)\n", + " chunk_split_by_doc_items = split_by_doc_items(\n", + " chunk, title_length, tokenizer, chunk_size\n", + " )\n", " chunks_after_splitting_by_items.extend(chunk_split_by_doc_items)\n", " chunks_after_splitting_recursively = []\n", " for chunk in chunks_after_splitting_by_items:\n", - " chunk_split_recursively = split_using_plain_text(chunk, title_length, tokenizer, splitter, chunk_size)\n", + " chunk_split_recursively = split_using_plain_text(\n", + " chunk, title_length, tokenizer, splitter, chunk_size\n", + " )\n", " chunks_after_splitting_recursively.extend(chunk_split_recursively)\n", - " chunks_afer_merging = merge_chunks(chunks_after_splitting_recursively, title_length, tokenizer, chunk_size)\n", + " chunks_afer_merging = merge_chunks(\n", + " chunks_after_splitting_recursively, title_length, tokenizer, chunk_size\n", + " )\n", " return chunks_afer_merging" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[DocumentChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. 
Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)), DocumentChunk(text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.\\nIn using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). 
We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/28', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.39203643798828, t=385.059814453125, r=473.0816345214844, b=312.2467041015625, coord_origin=), charspan=(0, 467))], orig='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', enumerated=False, marker='-'), TextItem(self_ref='#/texts/29', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.8291244506836, t=311.80438232421875, r=473.0190734863281, b=240.17425537109375, coord_origin=), charspan=(0, 508))], orig='In using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. 
For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).', text='In using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).')], headings=['4 Algorithm'], captions=None)), DocChunk(text='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. 
This may be extremely important in domains where there is less direct evidence involving the candidate answers.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/30', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.81511688232422, t=239.7743682861328, r=473.023681640625, b=156.86865234375, coord_origin=), charspan=(0, 548))], orig='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. This may be extremely important in domains where there is less direct evidence involving the candidate answers.', text='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. This may be extremely important in domains where there is less direct evidence involving the candidate answers.')], headings=['4 Algorithm'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocChunk(text='Detailed evaluations of deep evidence scoring components will be presented in a future publication. 
LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/32', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=123.72936248779297, t=666.814453125, r=473.1099853515625, b=523.5120239257812, coord_origin=), charspan=(0, 974))], orig='Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. 
These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.', text='Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.')], headings=['5 Evaluation and Conclusions'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocChunk(text='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. 
Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/33', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=123.72576141357422, t=522.784423828125, r=473.09423828125, b=355.3149108886719, coord_origin=), charspan=(0, 1115))], orig='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. 
As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.', text='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.')], headings=['5 Evaluation and Conclusions'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocumentChunk(text='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.\\n2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.\\n3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. 
Proceedings of the Cognitive Science Society .\\n4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/35', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=129.25999450683594, t=309.7490539550781, r=472.53497314453125, b=288.49505615234375, coord_origin=), charspan=(0, 145))], orig='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.', text='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/36', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.63955688476562, t=287.6390380859375, r=472.5289001464844, b=255.7160186767578, coord_origin=), charspan=(0, 244))], orig='2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.', text='2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/37', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.71495056152344, t=254.63900756835938, r=472.8249816894531, b=233.75601196289062, coord_origin=), charspan=(0, 118))], orig='3. Forbus, K. 
and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .', text='3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .', enumerated=False, marker='-'), ListItem(self_ref='#/texts/38', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.1455535888672, t=232.67901611328125, r=472.8609619140625, b=200.1046142578125, coord_origin=), charspan=(0, 241))], orig='4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.', text='4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.', enumerated=False, marker='-')], headings=['References'], captions=None))]\n" + "[DocChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. 
Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None)), DocChunk(text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. 
Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.\\nIn using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/28', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.39203643798828, t=385.059814453125, r=473.0816345214844, b=312.2467041015625, coord_origin=), charspan=(0, 467))], orig='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. 
Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', enumerated=False, marker='-'), TextItem(self_ref='#/texts/29', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.8291244506836, t=311.80438232421875, r=473.0190734863281, b=240.17425537109375, coord_origin=), charspan=(0, 508))], orig='In using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).', text='In using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).')], headings=['4 Algorithm'], captions=None, origin=None)), DocChunk(text='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . 
If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. This may be extremely important in domains where there is less direct evidence involving the candidate answers.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/30', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.81511688232422, t=239.7743682861328, r=473.023681640625, b=156.86865234375, coord_origin=), charspan=(0, 548))], orig='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. This may be extremely important in domains where there is less direct evidence involving the candidate answers.', text='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. 
This may be extremely important in domains where there is less direct evidence involving the candidate answers.')], headings=['4 Algorithm'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocChunk(text='Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/32', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=123.72936248779297, t=666.814453125, r=473.1099853515625, b=523.5120239257812, coord_origin=), charspan=(0, 974))], orig='Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. 
This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.', text='Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.')], headings=['5 Evaluation and Conclusions'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocChunk(text='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. 
However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/33', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=123.72576141357422, t=522.784423828125, r=473.09423828125, b=355.3149108886719, coord_origin=), charspan=(0, 1115))], orig='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. 
However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.', text='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.')], headings=['5 Evaluation and Conclusions'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocChunk(text='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.\\n2. 
Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.\\n3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .\\n4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/35', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=129.25999450683594, t=309.7490539550781, r=472.53497314453125, b=288.49505615234375, coord_origin=), charspan=(0, 145))], orig='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.', text='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/36', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.63955688476562, t=287.6390380859375, r=472.5289001464844, b=255.7160186767578, coord_origin=), charspan=(0, 244))], orig='2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.', text='2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. 
(2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/37', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.71495056152344, t=254.63900756835938, r=472.8249816894531, b=233.75601196289062, coord_origin=), charspan=(0, 118))], orig='3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .', text='3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .', enumerated=False, marker='-'), ListItem(self_ref='#/texts/38', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.1455535888672, t=232.67901611328125, r=472.8609619140625, b=200.1046142578125, coord_origin=), charspan=(0, 241))], orig='4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.', text='4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. 
Berlin: Springer Verlag.', enumerated=False, marker='-')], headings=['References'], captions=None, origin=None))]\n" ] } ], "source": [ "chunk_size = 256\n", "test_chunks = chunks[19:25]\n", - "adjusted = adjust_chunks_for_fixed_size(doc, test_chunks, TOKENIZER, make_splitter(TOKENIZER, chunk_size), chunk_size)\n", + "adjusted = adjust_chunks_for_fixed_size(\n", + " doc, test_chunks, TOKENIZER, make_splitter(TOKENIZER, chunk_size), chunk_size\n", + ")\n", "print(adjusted)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -699,14 +713,14 @@ } ], "source": [ - "print('Original chunks')\n", + "print(\"Original chunks\")\n", "\n", "for chunk in test_chunks:\n", " count = count_tokens(chunk.text, TOKENIZER)\n", " print(chunk.text)\n", " print(count)\n", "\n", - "print('Adjusted chunks')\n", + "print(\"Adjusted chunks\")\n", "\n", "for c in adjusted:\n", " count = count_tokens(c.text, TOKENIZER)\n", @@ -716,7 +730,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -724,17 +738,20 @@ " inner_chunker: BaseChunker = HierarchicalChunker()\n", " max_tokens: PositiveInt = 512\n", " embedding_model_id: str\n", + "\n", " def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:\n", " preliminary_chunks = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n", " tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_id)\n", " splitter = make_splitter(tokenizer, self.max_tokens)\n", - " output_chunks = adjust_chunks_for_fixed_size(doc, preliminary_chunks, tokenizer, splitter, self.max_tokens)\n", + " output_chunks = adjust_chunks_for_fixed_size(\n", + " doc, preliminary_chunks, tokenizer, splitter, self.max_tokens\n", + " )\n", " return iter(output_chunks)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -767,25 +784,114 @@ } ], "source": [ - "chunker = 
MaxTokenLimitingChunkerWithMerging(max_tokens=64, embedding_model_id=EMBED_MODEL_ID)\n", + "chunker = MaxTokenLimitingChunkerWithMerging(\n", + " max_tokens=64, embedding_model_id=EMBED_MODEL_ID\n", + ")\n", "final_output_chunks = chunker.chunk(dl_doc=doc)\n", "\n", "\n", "i = 0\n", "for chunk in final_output_chunks:\n", - " print(chunk.text) \n", + " print(chunk.text)\n", " print(count_tokens(chunk.text, TOKENIZER))\n", " i += 1\n", " if i > 10:\n", " break" ] }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.01480076, -0.02467153, 0.07359385, -0.0503214 , -0.07260533,\n", + " 0.04160994, 0.0630886 , -0.0369585 , -0.02305009, 0.06851925],\n", + " dtype=float32)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "EMBED_MODEL = SentenceTransformer(EMBED_MODEL_ID)\n", + "embeddings = EMBED_MODEL.encode(\"Frogs are nice!\")\n", + "embeddings[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\\n\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.',\n", + " '4 Algorithm']" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def make_text_for_embedding(chunk):\n", + " output = [chunk.text]\n", + " if chunk.meta.headings is not None:\n", + " output.extend(chunk.meta.headings)\n", + " if chunk.meta.captions is not None:\n", + " output.extend(chunk.meta.captions)\n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "def make_lancedb_index(index_location, index_name, chunks, embedding_model):\n", + " db = lancedb.connect(index_location)\n", + " data = []\n", + " for chunk in chunks:\n", + " text_for_embedding = make_text_for_embedding(chunk)\n", + " embeddings = embedding_model.encode(text_for_embedding)\n", + " data_item = {\n", + " \"vector\": embeddings,\n", + " \"text\": chunk.text,\n", + " \"headings\": chunk.meta.headings,\n", + " \"captions\": chunk.meta.captions\n", + " }\n", + " data.append(data_item)\n", + "\n", + " tbl = db.create_table(index_name, data=data)\n", + " return tbl" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": 
[], -
"source": [] + "source": [ + "index = make_lancedb_index(\"data/lancedb\", doc.name, chunks, EMBED_MODEL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_query = \"Making SME greedy and pragmatic\"\n", + "sample_embedding = EMBED_MODEL.encode(sample_query)" + ] } ], "metadata": {