mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
889 lines
67 KiB
Plaintext
889 lines
67 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Basic RAG pipeline with LlamaIndex"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# requirements for this example:\n",
|
|
"%pip install -qq docling docling-core python-dotenv llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus jsonpath-ng"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import os\n",
|
|
"from tempfile import TemporaryDirectory\n",
|
|
"\n",
|
|
"from dotenv import load_dotenv\n",
|
|
"from pydantic import TypeAdapter\n",
|
|
"from rich.console import Console\n",
|
|
"from rich.pretty import pprint\n",
|
|
"\n",
|
|
"console = Console()\n",
|
|
"\n",
|
|
"\n",
|
|
"load_dotenv()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import warnings\n",
|
|
"\n",
|
|
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
|
|
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## LlamaIndex extensions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Below we define our framework extensions:\n",
|
|
"- `DoclingPDFReader` which will be used to create LlamaIndex documents, and\n",
|
|
"- `HierarchicalJSONNodeParser`, which will be used to create LlamaIndex nodes out of the documents"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"```mermaid\n",
|
|
"flowchart LR\n",
|
|
" subgraph LI extensions\n",
|
|
" direction LR\n",
|
|
" Reader -->|LI Documents| Parser\n",
|
|
" end\n",
|
|
" Reader(DoclingPDFReader)\n",
|
|
" Parser(HierarchicalJSONNodeParser)\n",
|
|
" Encoder(EmbedModel)\n",
|
|
" Store(VectorStore)\n",
|
|
" Parser -->|LI Nodes| Encoder -->|LI Nodes| Store\n",
|
|
"```"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from enum import Enum\n",
|
|
"from typing import Iterable\n",
|
|
"\n",
|
|
"from llama_index.core.readers.base import BasePydanticReader\n",
|
|
"from llama_index.core.schema import Document as LIDocument\n",
|
|
"from pydantic import BaseModel\n",
|
|
"\n",
|
|
"from docling.document_converter import DocumentConverter\n",
|
|
"\n",
|
|
"\n",
|
|
"class DocumentMetadata(BaseModel):\n",
|
|
" dl_doc_hash: str\n",
|
|
"\n",
|
|
"\n",
|
|
"class DoclingPDFReader(BasePydanticReader):\n",
|
|
" class ParseType(str, Enum):\n",
|
|
" MARKDOWN = \"markdown\"\n",
|
|
" JSON = \"json\"\n",
|
|
"\n",
|
|
" parse_type: ParseType = ParseType.MARKDOWN\n",
|
|
"\n",
|
|
" def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:\n",
|
|
" file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
|
|
" converter = DocumentConverter()\n",
|
|
" for source in file_paths:\n",
|
|
" dl_doc = converter.convert_single(source).output\n",
|
|
" match self.parse_type:\n",
|
|
" case self.ParseType.MARKDOWN:\n",
|
|
" text = dl_doc.export_to_markdown()\n",
|
|
" case self.ParseType.JSON:\n",
|
|
" text = dl_doc.model_dump_json()\n",
|
|
" case _:\n",
|
|
" raise RuntimeError(\n",
|
|
" f\"Unexpected parse type encountered: {self.parse_type}\"\n",
|
|
" )\n",
|
|
" excl_metadata_keys = [\"dl_doc_hash\"]\n",
|
|
" li_doc = LIDocument(\n",
|
|
" doc_id=dl_doc.file_info.document_hash,\n",
|
|
" text=text,\n",
|
|
" excluded_embed_metadata_keys=excl_metadata_keys,\n",
|
|
" excluded_llm_metadata_keys=excl_metadata_keys,\n",
|
|
" )\n",
|
|
" li_doc.metadata = DocumentMetadata(\n",
|
|
" dl_doc_hash=dl_doc.file_info.document_hash,\n",
|
|
" ).model_dump()\n",
|
|
" yield li_doc"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from typing import Any, Iterable, Sequence\n",
|
|
"\n",
|
|
"from docling_core.transforms.chunker import HierarchicalChunker\n",
|
|
"from docling_core.types import Document as DLDocument\n",
|
|
"from llama_index.core import Document as LIDocument\n",
|
|
"from llama_index.core.node_parser.interface import NodeParser\n",
|
|
"from llama_index.core.schema import (\n",
|
|
" BaseNode,\n",
|
|
" NodeRelationship,\n",
|
|
" RelatedNodeType,\n",
|
|
" TextNode,\n",
|
|
")\n",
|
|
"from llama_index.core.utils import get_tqdm_iterable\n",
|
|
"\n",
|
|
"\n",
|
|
"class NodeMetadata(BaseModel):\n",
|
|
" path: str\n",
|
|
"\n",
|
|
"\n",
|
|
"class HierarchicalJSONNodeParser(NodeParser):\n",
|
|
"\n",
|
|
" include_metadata: bool = False\n",
|
|
"\n",
|
|
" def _parse_nodes(\n",
|
|
" self,\n",
|
|
" nodes: Sequence[BaseNode],\n",
|
|
" show_progress: bool = False,\n",
|
|
" **kwargs: Any,\n",
|
|
" ) -> list[BaseNode]:\n",
|
|
" nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(\n",
|
|
" items=nodes, show_progress=show_progress, desc=\"Parsing nodes\"\n",
|
|
" )\n",
|
|
" all_nodes: list[BaseNode] = []\n",
|
|
" chunker = HierarchicalChunker()\n",
|
|
" for input_node in nodes_with_progress:\n",
|
|
" li_doc = LIDocument.model_validate(input_node)\n",
|
|
" dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())\n",
|
|
" chunk_iter = chunker.chunk(dl_doc=dl_doc)\n",
|
|
" for chunk in chunk_iter:\n",
|
|
" rels: dict[NodeRelationship, RelatedNodeType] = {\n",
|
|
" NodeRelationship.SOURCE: li_doc.as_related_node_info(),\n",
|
|
" }\n",
|
|
" excl_metadata_keys = [\"path\"]\n",
|
|
" node = TextNode(\n",
|
|
" text=chunk.text,\n",
|
|
" excluded_embed_metadata_keys=excl_metadata_keys,\n",
|
|
" excluded_llm_metadata_keys=excl_metadata_keys,\n",
|
|
" relationships=rels,\n",
|
|
" )\n",
|
|
" node.metadata = NodeMetadata(\n",
|
|
" path=chunk.path,\n",
|
|
" ).model_dump()\n",
|
|
" all_nodes.append(node)\n",
|
|
" return all_nodes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Reader and node parser"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using JSON"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"To leverage Docling's rich document structure format, we namely set the parse type to JSON and use a HierarchicalJSONNodeParser accordingly:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)\n",
|
|
"node_parser = HierarchicalJSONNodeParser()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using Markdown"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# from llama_index.core.node_parser import MarkdownNodeParser\n",
|
|
"\n",
|
|
"# reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n",
|
|
"# node_parser = MarkdownNodeParser()\n",
|
|
"# transformations = [node_parser]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Transformations"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Our transformations currently include the `node_parser`:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"transformations = [node_parser]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"One can include add more transformations, e.g. further chunking based on text size / overlap, as shown below:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# from llama_index.core.node_parser import TokenTextSplitter\n",
|
|
"\n",
|
|
"# splitter = TokenTextSplitter(\n",
|
|
"# chunk_size=1024,\n",
|
|
"# chunk_overlap=20,\n",
|
|
"# )\n",
|
|
"# transformations.append(splitter)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Embed model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
|
|
"\n",
|
|
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Vector store"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"INGEST = True # whether to ingest from scratch or reuse an existing vector store"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
|
"To disable this warning, you can either:\n",
|
|
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
|
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
|
|
"\n",
|
|
"MILVUS_URL = os.environ.get(\n",
|
|
" \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
|
|
")\n",
|
|
"MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"basic_llamaindex_pipeline\")\n",
|
|
"MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get(\"MILVUS_KWARGS\", \"{}\"))\n",
|
|
"vector_store = MilvusVectorStore(\n",
|
|
" uri=MILVUS_URL,\n",
|
|
" collection_name=MILVUS_COLL_NAME,\n",
|
|
" dim=len(embed_model.get_text_embedding(\"hi\")),\n",
|
|
" overwrite=INGEST,\n",
|
|
" **MILVUS_KWARGS,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "b79e2579c1a544159099e372010f2692",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #00ff00; text-decoration-color: #00ff00\">─────────────────────────────────────────────── </span>example `Document`:<span style=\"color: #00ff00; text-decoration-color: #00ff00\"> ───────────────────────────────────────────────</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[92m─────────────────────────────────────────────── \u001b[0mexample `Document`:\u001b[92m ───────────────────────────────────────────────\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">[</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Document</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span><span style=\"font-weight: bold\">}</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{}</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">171405</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">)</span>\n",
|
|
"<span style=\"font-weight: bold\">]</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1m[\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"'\u001b[0m+\u001b[1;36m171405\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[1m)\u001b[0m\n",
|
|
"\u001b[1m]\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"num of Nodes (chunks): 76\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #00ff00; text-decoration-color: #00ff00\">────────────────────────────────────────────── </span>example Node <span style=\"font-weight: bold\">(</span>chunk<span style=\"font-weight: bold\">)</span>:<span style=\"color: #00ff00; text-decoration-color: #00ff00\"> ──────────────────────────────────────────────</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[92m────────────────────────────────────────────── \u001b[0mexample Node \u001b[1m(\u001b[0mchunk\u001b[1m)\u001b[0m:\u001b[92m ──────────────────────────────────────────────\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'715fc3ff-9d97-45fd-91bf-f57e3ea3ac94'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[7]'</span><span style=\"font-weight: bold\">}</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">NodeRelationship.SOURCE:</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.DOCUMENT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'4'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'70041f7130ef2e27cc1e6156ffa78d3a1f3e0c3cb7763cc314cda6a47a029350'</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.PREVIOUS: </span><span style=\"color: #008000; text-decoration-color: #008000\">'2'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'06c2ded5-e46a-4278-92ea-eed9bc78b9f6'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"font-weight: bold\">></span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'c51973e676e01da1fc3504107653ce74d198dbafa50eedd32cf5d5e3c7a82e0b'</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">}</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'ABSTRACT\\nAccurate document layout analysis is a key requirement for highquality PDF document conversion. '</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1499</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
|
|
"<span style=\"font-weight: bold\">)</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'715fc3ff-9d97-45fd-91bf-f57e3ea3ac94'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m7\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'path'\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'path'\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.DOCUMENT: \u001b[0m\u001b[32m'4'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'70041f7130ef2e27cc1e6156ffa78d3a1f3e0c3cb7763cc314cda6a47a029350'\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[39m<NodeRelationship.PREVIOUS: \u001b[0m\u001b[32m'2'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'06c2ded5-e46a-4278-92ea-eed9bc78b9f6'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[1m>\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mhash\u001b[0m=\u001b[32m'c51973e676e01da1fc3504107653ce74d198dbafa50eedd32cf5d5e3c7a82e0b'\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'ABSTRACT\\nAccurate document layout analysis is a key requirement for highquality PDF document conversion. '\u001b[0m+\u001b[1;36m1499\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
|
|
"\u001b[1m)\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from llama_index.core import StorageContext, VectorStoreIndex\n",
|
|
"from llama_index.core.ingestion import IngestionPipeline\n",
|
|
"\n",
|
|
"if INGEST:\n",
|
|
" # in this case we ingest the data into the vector store\n",
|
|
" docs = reader.load_data(\n",
|
|
" file_path=\"https://arxiv.org/pdf/2206.01062\", # DocLayNet paper\n",
|
|
" )\n",
|
|
" console.rule(f\"example `Document`:\")\n",
|
|
" pprint(docs, max_length=1, max_string=50, max_depth=4)\n",
|
|
" pipeline = IngestionPipeline(\n",
|
|
" transformations=transformations,\n",
|
|
" vector_store=vector_store,\n",
|
|
" )\n",
|
|
" nodes = pipeline.run(documents=docs)\n",
|
|
" print(f\"num of Nodes (chunks): {len(nodes)}\")\n",
|
|
" console.rule(f\"example Node (chunk):\")\n",
|
|
" pprint(\n",
|
|
" nodes[6],\n",
|
|
" max_length=2,\n",
|
|
" max_string=105,\n",
|
|
" max_depth=3,\n",
|
|
" )\n",
|
|
" index = VectorStoreIndex(nodes=nodes, embed_model=embed_model)\n",
|
|
"\n",
|
|
"else:\n",
|
|
" # in this case we just load the vector store index\n",
|
|
" index = VectorStoreIndex.from_vector_store(\n",
|
|
" vector_store=vector_store,\n",
|
|
" embed_model=embed_model,\n",
|
|
" )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## LLM"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
|
|
"\n",
|
|
"HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
|
|
"\n",
|
|
"llm = HuggingFaceInferenceAPI(\n",
|
|
" token=HF_API_KEY,\n",
|
|
" model_name=\"mistralai/Mistral-7B-Instruct-v0.3\",\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## RAG"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Response</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">response</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'The 80K pages were human annotated.'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">source_nodes</span>=<span style=\"font-weight: bold\">[</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'431c2d9a-fc55-40ad-a635-d12aeb6b1cec'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[74]'</span><span style=\"font-weight: bold\">}</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{<</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">NodeRelationship.SOURCE:</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"font-weight: bold\">></span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"font-weight: bold\">(</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">)</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span><span style=\"font-weight: bold\">}</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'4 ANNOTATION CAMPAIGN\\nThe complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"font-weight: bold\">)</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.7986579795165596</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'431c2d9a-fc55-40ad-a635-d12aeb6b1cec'</span>: <span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[74]'</span><span style=\"font-weight: bold\">}</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">}</span>\n",
|
|
"<span style=\"font-weight: bold\">)</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'The 80K pages were human annotated.'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'431c2d9a-fc55-40ad-a635-d12aeb6b1cec'\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m74\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'path'\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'path'\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[1m>\u001b[0m: \u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[33m...\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m2\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'4 ANNOTATION CAMPAIGN\\nThe complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.'\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.7986579795165596\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'431c2d9a-fc55-40ad-a635-d12aeb6b1cec'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m74\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[1m}\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m\n",
|
|
"\u001b[1m)\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from llama_index.core import PromptTemplate\n",
|
|
"\n",
|
|
"TEXT_QA_TEMPLATE_STR = \"Context information is below.\\n---------------------\\n{context_str}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {query_str}\\nAnswer:\\n\"\n",
|
|
"\n",
|
|
"query_engine = index.as_query_engine(\n",
|
|
" llm=llm,\n",
|
|
" text_qa_template=PromptTemplate(TEXT_QA_TEMPLATE_STR),\n",
|
|
")\n",
|
|
"query_res = query_engine.query(\"How many pages were human annotated?\")\n",
|
|
"pprint(query_res, max_length=1, max_string=250, max_depth=5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Fetching Docling native layout info\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import jsonpath_ng\n",
|
|
"from docling_core.types import BaseText, Document, Ref\n",
|
|
"\n",
|
|
"NativeNode = BaseText | Ref\n",
|
|
"\n",
|
|
"\n",
|
|
"def get_native_node(dl_doc: Document, path: str) -> NativeNode:\n",
|
|
" jsonpath_expr = jsonpath_ng.parse(path)\n",
|
|
" jsonpath_res = [match.value for match in jsonpath_expr.find(dl_doc.model_dump())]\n",
|
|
" if (num_res := len(jsonpath_res)) == 0:\n",
|
|
" raise RuntimeError(f\"No results found for {path}\")\n",
|
|
" elif num_res > 1:\n",
|
|
" # currently only single result supported\n",
|
|
" raise RuntimeError(f\"Multiple results found for {path}\")\n",
|
|
" jres = jsonpath_res[0]\n",
|
|
" return TypeAdapter(NativeNode).validate_python(jres)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #00ff00; text-decoration-color: #00ff00\">────────────────────────────────── </span><span style=\"color: #808000; text-decoration-color: #808000\">dl_doc_hash</span>=<span style=\"color: #800080; text-decoration-color: #800080\">5dfbd8c</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <span style=\"color: #808000; text-decoration-color: #808000\">path</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[74]'</span><span style=\"color: #00ff00; text-decoration-color: #00ff00\"> ──────────────────────────────────</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[92m────────────────────────────────── \u001b[0m\u001b[33mdl_doc_hash\u001b[0m=\u001b[35m5dfbd8c\u001b[0m\u001b[33m...\u001b[0m \u001b[33mpath\u001b[0m=\u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m74\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[92m ──────────────────────────────────\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">BaseText</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">obj_type</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'paragraph'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">name</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'Text'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">font</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">prov</span>=<span style=\"font-weight: bold\">[</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Prov</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">bbox</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">52.994422912597656</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">217.798828125</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">295.5625305175781</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">259.6097106933594</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">page</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">span</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">221</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">ref_s3_data</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">]</span>\n",
|
|
"<span style=\"font-weight: bold\">)</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1;35mBaseText\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mobj_type\u001b[0m=\u001b[32m'paragraph'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mname\u001b[0m=\u001b[32m'Text'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mfont\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mprov\u001b[0m=\u001b[1m[\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mProv\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mbbox\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;36m52.994422912597656\u001b[0m, \u001b[1;36m217.798828125\u001b[0m, \u001b[1;36m295.5625305175781\u001b[0m, \u001b[1;36m259.6097106933594\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mpage\u001b[0m=\u001b[1;36m5\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mspan\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;36m0\u001b[0m, \u001b[1;36m221\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mref_s3_data\u001b[0m=\u001b[3;35mNone\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m\n",
|
|
"\u001b[1m)\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #00ff00; text-decoration-color: #00ff00\">────────────────────────────────── </span><span style=\"color: #808000; text-decoration-color: #808000\">dl_doc_hash</span>=<span style=\"color: #800080; text-decoration-color: #800080\">5dfbd8c</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <span style=\"color: #808000; text-decoration-color: #808000\">path</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[79]'</span><span style=\"color: #00ff00; text-decoration-color: #00ff00\"> ──────────────────────────────────</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[92m────────────────────────────────── \u001b[0m\u001b[33mdl_doc_hash\u001b[0m=\u001b[35m5dfbd8c\u001b[0m\u001b[33m...\u001b[0m \u001b[33mpath\u001b[0m=\u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m79\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[92m ──────────────────────────────────\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">BaseText</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">\"Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted\"</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">obj_type</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'paragraph'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">name</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'Text'</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">font</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">prov</span>=<span style=\"font-weight: bold\">[</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Prov</span><span style=\"font-weight: bold\">(</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">bbox</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">317.00592041015625</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">82.7375717163086</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">559.7149047851562</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">245.28392028808594</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">page</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">span</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">987</span><span style=\"font-weight: bold\">]</span>,\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">ref_s3_data</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>\n",
|
|
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">]</span>\n",
|
|
"<span style=\"font-weight: bold\">)</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1;35mBaseText\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m\"Phase\u001b[0m\u001b[32m 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement \u001b[0m\u001b[32m(\u001b[0m\u001b[32msee Table 1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. We wanted\"\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mobj_type\u001b[0m=\u001b[32m'paragraph'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mname\u001b[0m=\u001b[32m'Text'\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mfont\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[33mprov\u001b[0m=\u001b[1m[\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mProv\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mbbox\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;36m317.00592041015625\u001b[0m, \u001b[1;36m82.7375717163086\u001b[0m, \u001b[1;36m559.7149047851562\u001b[0m, \u001b[1;36m245.28392028808594\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mpage\u001b[0m=\u001b[1;36m5\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mspan\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;36m0\u001b[0m, \u001b[1;36m987\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mref_s3_data\u001b[0m=\u001b[3;35mNone\u001b[0m\n",
|
|
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n",
|
|
"\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m\n",
|
|
"\u001b[1m)\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from docling_core.types import Document as DLDocument\n",
|
|
"\n",
|
|
"dl_docs = [DLDocument.model_validate_json(doc.text) for doc in docs]\n",
|
|
"for retr_item in query_res.source_nodes:\n",
|
|
" source_metadata = DocumentMetadata.model_validate(\n",
|
|
" retr_item.node.source_node.metadata\n",
|
|
" )\n",
|
|
" dl_doc_hash = source_metadata.dl_doc_hash\n",
|
|
" dl_doc = [d for d in dl_docs if d.file_info.document_hash == dl_doc_hash][0]\n",
|
|
" path = NodeMetadata.model_validate(retr_item.node.metadata).path\n",
|
|
" native_node = get_native_node(dl_doc=dl_doc, path=path)\n",
|
|
" console.rule(f\"dl_doc_hash={dl_doc_hash[:7]}...\\n{path=}\")\n",
|
|
" pprint(native_node)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|