{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RAG with Docling and 🦙 LlamaIndex" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# requirements for this example:\n", "%pip install -qq docling docling-core python-dotenv llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "from tempfile import TemporaryDirectory\n", "\n", "from dotenv import load_dotenv\n", "from pydantic import TypeAdapter\n", "from rich.pretty import pprint\n", "\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Helpers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we define:\n", "\n", "- `DoclingPDFReader` which will be used to create LlamaIndex documents, and\n", "- `HierarchicalJSONNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from enum import Enum\n", "from pathlib import Path\n", "from typing import Any, Iterable\n", "\n", "from llama_index.core.readers.base import BasePydanticReader\n", "from llama_index.core.schema import Document as LIDocument\n", "\n", "from 
docling.document_converter import DocumentConverter\n", "\n", "class DocMetaKeys(str, Enum):\n", " DL_DOC_HASH = \"dl_doc_hash\"\n", " ORIGIN = \"origin\"\n", "\n", "class DoclingPDFReader(BasePydanticReader):\n", " class ParseType(str, Enum):\n", " MARKDOWN = \"markdown\"\n", " JSON = \"json\"\n", "\n", " parse_type: ParseType = ParseType.MARKDOWN\n", "\n", " def lazy_load_data(\n", " self,\n", " file_path: str | Path | Iterable[str] | Iterable[Path],\n", " *args: Any,\n", " **load_kwargs: Any,\n", " ) -> Iterable[LIDocument]:\n", " file_paths = (\n", " file_path\n", " if isinstance(file_path, Iterable) and not isinstance(file_path, str)\n", " else [file_path]\n", " )\n", " converter = DocumentConverter()\n", " for source in file_paths:\n", " dl_doc = converter.convert_single(source).output\n", " match self.parse_type:\n", " case self.ParseType.MARKDOWN:\n", " text = dl_doc.export_to_markdown()\n", " case self.ParseType.JSON:\n", " text = dl_doc.model_dump_json()\n", " case _:\n", " raise RuntimeError(\n", " f\"Unexpected export type encountered: {self.export_type}\"\n", " )\n", " origin = str(source) if isinstance(source, Path) else source\n", " li_doc = LIDocument(text=text)\n", " li_doc.metadata = {\n", " DocMetaKeys.DL_DOC_HASH: dl_doc.file_info.document_hash,\n", " DocMetaKeys.ORIGIN: origin,\n", " }\n", " yield li_doc" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from typing import Any, Iterable, Sequence\n", "\n", "from docling_core.transforms.chunker import ChunkWithMetadata, HierarchicalChunker\n", "from docling_core.types import Document as DLDocument\n", "from llama_index.core import Document as LIDocument\n", "from llama_index.core.node_parser.interface import NodeParser\n", "from llama_index.core.schema import (\n", " BaseNode,\n", " NodeRelationship,\n", " RelatedNodeType,\n", " TextNode,\n", ")\n", "from llama_index.core.utils import get_tqdm_iterable\n", "\n", "\n", "class NodeMetaKeys(str, 
class HierarchicalJSONNodeParser(NodeParser):
    """Node parser that chunks JSON-serialized Docling documents.

    Each input node is expected to carry the JSON dump of a Docling document as
    its content. The document is re-validated from that JSON, chunked with
    ``HierarchicalChunker``, and every chunk becomes one ``TextNode``.
    """

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[BaseNode]:
        """Turn JSON-based document nodes into one text node per chunk.

        Args:
            nodes: Input nodes whose content is a JSON-serialized Docling
                document.
            show_progress: Whether to wrap the iteration in a progress bar.
            **kwargs: Unused.

        Returns:
            The text nodes created from all chunks of all input nodes, in
            iteration order. Each node records its source document as a
            relationship and the chunk path (plus page and bounding box when
            the chunk provides them) as metadata.
        """
        nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(
            items=nodes, show_progress=show_progress, desc="Parsing nodes"
        )
        # The excluded keys do not depend on the chunk, so build them once.
        # `dict.fromkeys` drops the duplicate "origin" (present in both enums)
        # while preserving order.
        excl_meta_keys = list(
            dict.fromkeys(
                [d.value for d in DocMetaKeys] + [n.value for n in NodeMetaKeys]
            )
        )

        all_nodes: list[BaseNode] = []
        chunker = HierarchicalChunker()
        for input_node in nodes_with_progress:
            li_doc = LIDocument.model_validate(input_node)
            dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())
            for chunk in chunker.chunk(dl_doc=dl_doc):
                rels: dict[NodeRelationship, RelatedNodeType] = {
                    NodeRelationship.SOURCE: li_doc.as_related_node_info(),
                }
                node = TextNode(
                    text=chunk.text,
                    # Separate copies so no two fields or nodes alias one list.
                    excluded_embed_metadata_keys=list(excl_meta_keys),
                    excluded_llm_metadata_keys=list(excl_meta_keys),
                    relationships=rels,
                )
                # Plain string keys (not enum members) keep the metadata keys
                # formatting and serializing as the bare strings.
                metadata: dict[str, Any] = {NodeMetaKeys.PATH.value: chunk.path}
                if isinstance(chunk, ChunkWithMetadata):
                    metadata[NodeMetaKeys.PAGE.value] = chunk.page
                    metadata[NodeMetaKeys.BBOX.value] = chunk.bbox
                node.metadata = metadata
                all_nodes.append(node)
        return all_nodes
[ "reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)\n", "node_parser = HierarchicalJSONNodeParser()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using Markdown" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# from llama_index.core.node_parser import MarkdownNodeParser\n", "\n", "# reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n", "# node_parser = MarkdownNodeParser()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transformations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our transformations currently include the `node_parser`:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "transformations = [node_parser]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "One can add more transformations, e.g. 
further chunking based on text size / overlap, as shown below:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# from llama_index.core.node_parser import TokenTextSplitter\n", "\n", "# splitter = TokenTextSplitter(\n", "# chunk_size=1024,\n", "# chunk_overlap=0,\n", "# )\n", "# transformations.append(splitter)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Embed model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", "\n", "embed_model = HuggingFaceEmbedding(model_name=\"intfloat/multilingual-e5-small\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vector store" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "INGEST = True # whether to ingest from scratch or reuse an existing vector store" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
from llama_index.vector_stores.milvus import MilvusVectorStore

# Connection settings are read from the environment, with local fallbacks.
# The temporary directory is created whether or not MILVUS_URL is set; it only
# backs the fallback database path used when the variable is absent.
tmp_dir = TemporaryDirectory()
MILVUS_URL = os.environ.get("MILVUS_URL", f"{tmp_dir.name}/milvus_demo.db")
MILVUS_COLL_NAME = os.environ.get("MILVUS_COLL_NAME", "basic_llamaindex_pipeline")
# Optional extra keyword arguments for the store, given as a JSON object string.
MILVUS_KWARGS = TypeAdapter(dict).validate_json(
    os.environ.get("MILVUS_KWARGS", "{}")
)

vector_store = MilvusVectorStore(
    uri=MILVUS_URL,
    collection_name=MILVUS_COLL_NAME,
    # The vector dimension is taken from the length of a sample embedding.
    dim=len(embed_model.get_text_embedding("hi")),
    # The INGEST flag is passed straight through as the `overwrite` argument.
    overwrite=INGEST,
    **MILVUS_KWARGS,
)
[\n", "│ Document(\n", "│ │ id_='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n", "│ │ embedding=None,\n", "│ │ metadata={'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'+14, ... +1},\n", "│ │ excluded_embed_metadata_keys=[],\n", "│ │ excluded_llm_metadata_keys=[],\n", "│ │ relationships={},\n", "│ │ text='{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"'+173793,\n", "│ │ mimetype='text/plain',\n", "│ │ start_char_idx=None,\n", "│ │ end_char_idx=None,\n", "│ │ text_template='{metadata_str}\\n\\n{content}',\n", "│ │ metadata_template='{key}: {value}',\n", "│ │ metadata_seperator='\\n'\n", "│ )\n", "]\n", "\n" ], "text/plain": [ "\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"'\u001b[0m+\u001b[1;36m173793\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", 
from llama_index.core import StorageContext, VectorStoreIndex

# Build the index: either ingest the document from scratch or attach to the
# vector store that was configured earlier, depending on the INGEST flag.
if INGEST:
    # in this case we ingest the data into the vector store
    docs = reader.load_data(
        file_path="https://arxiv.org/pdf/2206.01062",  # DocLayNet paper
    )
    # Print a truncated view of the loaded documents for inspection.
    pprint(docs, max_length=1, max_string=50, max_depth=4)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # The documents are passed together with the configured transformations,
    # embedding model and storage context.
    index = VectorStoreIndex.from_documents(
        documents=docs,
        embed_model=embed_model,
        storage_context=storage_context,
        transformations=transformations,
    )
else:
    # in this case we just load the vector store index
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )

from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# The API token is read from the environment; it is None when HF_API_KEY is unset.
HF_API_KEY = os.environ.get("HF_API_KEY")

llm = HuggingFaceInferenceAPI(
    token=HF_API_KEY,
    model_name="mistralai/Mistral-7B-Instruct-v0.3",
)
Response(\n", "│ response='80863 pages were annotated by humans.',\n", "│ source_nodes=[\n", "│ │ NodeWithScore(\n", "│ │ │ node=TextNode(\n", "│ │ │ │ id_='79ee790b-73d8-4268-90d7-301b5cd5e8f4',\n", "│ │ │ │ embedding=None,\n", "│ │ │ │ metadata={\n", "│ │ │ │ │ 'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n", "│ │ │ │ │ 'origin': 'https://arxiv.org/pdf/2206.01062',\n", "│ │ │ │ │ 'path': '$.main-text[36]',\n", "│ │ │ │ │ 'page': 2,\n", "│ │ │ │ │ 'bbox': [317.11236572265625, 116.19312286376953, 559.7131958007812, 202.27523803710938]\n", "│ │ │ │ },\n", "│ │ │ │ excluded_embed_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n", "│ │ │ │ excluded_llm_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n", "│ │ │ │ relationships={\n", "│ │ │ │ │ <NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(\n", "│ │ │ │ │ │ node_id='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n", "│ │ │ │ │ │ node_type=<ObjectType.DOCUMENT: '4'>,\n", "│ │ │ │ │ │ metadata={...},\n", "│ │ │ │ │ │ hash='10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\n", "│ │ │ │ │ ),\n", "│ │ │ │ │ <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(\n", "│ │ │ │ │ │ node_id='5509c0ef-2890-4bba-aa0f-82c0c389a621',\n", "│ │ │ │ │ │ node_type=<ObjectType.TEXT: '1'>,\n", "│ │ │ │ │ │ metadata={...},\n", "│ │ │ │ │ │ hash='d2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'\n", "│ │ │ │ │ ),\n", "│ │ │ │ │ <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(\n", "│ │ │ │ │ │ node_id='47f51f1f-e92f-4d82-b36e-466fa62f8e34',\n", "│ │ │ │ │ │ node_type=<ObjectType.TEXT: '1'>,\n", "│ │ │ │ │ │ metadata={...},\n", "│ │ │ │ │ │ hash='df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'\n", "│ │ │ │ │ )\n", "│ │ │ │ },\n", "│ │ │ │ text='3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. 
This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'+296,\n", "│ │ │ │ mimetype='text/plain',\n", "│ │ │ │ start_char_idx=None,\n", "│ │ │ │ end_char_idx=None,\n", "│ │ │ │ text_template='{metadata_str}\\n\\n{content}',\n", "│ │ │ │ metadata_template='{key}: {value}',\n", "│ │ │ │ metadata_seperator='\\n'\n", "│ │ │ ),\n", "│ │ │ score=0.8344892859458923\n", "│ │ ),\n", "│ │ NodeWithScore(\n", "│ │ │ node=TextNode(\n", "│ │ │ │ id_='e1585b75-17f1-42b1-882a-f44e6ae4d382',\n", "│ │ │ │ embedding=None,\n", "│ │ │ │ metadata={\n", "│ │ │ │ │ 'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n", "│ │ │ │ │ 'origin': 'https://arxiv.org/pdf/2206.01062',\n", "│ │ │ │ │ 'path': '$.main-text[75]',\n", "│ │ │ │ │ 'page': 5,\n", "│ │ │ │ │ 'bbox': [53.26631546020508, 86.24749755859375, 295.562255859375, 215.95584106445312]\n", "│ │ │ │ },\n", "│ │ │ │ excluded_embed_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n", "│ │ │ │ excluded_llm_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... 
+1],\n", "│ │ │ │ relationships={\n", "│ │ │ │ │ <NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(\n", "│ │ │ │ │ │ node_id='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n", "│ │ │ │ │ │ node_type=<ObjectType.DOCUMENT: '4'>,\n", "│ │ │ │ │ │ metadata={...},\n", "│ │ │ │ │ │ hash='10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\n", "│ │ │ │ │ ),\n", "│ │ │ │ │ <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(\n", "│ │ │ │ │ │ node_id='964511c8-a412-47c4-8a3d-e4bf92edbda4',\n", "│ │ │ │ │ │ node_type=<ObjectType.TEXT: '1'>,\n", "│ │ │ │ │ │ metadata={...},\n", "│ │ │ │ │ │ hash='c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'\n", "│ │ │ │ │ ),\n", "│ │ │ │ │ <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(\n", "│ │ │ │ │ │ node_id='9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd',\n", "│ │ │ │ │ │ node_type=<ObjectType.TEXT: '1'>,\n", "│ │ │ │ │ │ metadata={...},\n", "│ │ │ │ │ │ hash='09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'\n", "│ │ │ │ │ )\n", "│ │ │ │ },\n", "│ │ │ │ text='4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. 
Therefore'+564,\n", "│ │ │ │ mimetype='text/plain',\n", "│ │ │ │ start_char_idx=None,\n", "│ │ │ │ end_char_idx=None,\n", "│ │ │ │ text_template='{metadata_str}\\n\\n{content}',\n", "│ │ │ │ metadata_template='{key}: {value}',\n", "│ │ │ │ metadata_seperator='\\n'\n", "│ │ │ ),\n", "│ │ │ score=0.8309065699577332\n", "│ │ )\n", "│ ],\n", "│ metadata={\n", "│ │ '79ee790b-73d8-4268-90d7-301b5cd5e8f4': {\n", "│ │ │ 'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n", "│ │ │ 'origin': 'https://arxiv.org/pdf/2206.01062',\n", "│ │ │ 'path': '$.main-text[36]',\n", "│ │ │ 'page': 2,\n", "│ │ │ 'bbox': [317.11236572265625, 116.19312286376953, 559.7131958007812, 202.27523803710938]\n", "│ │ },\n", "│ │ 'e1585b75-17f1-42b1-882a-f44e6ae4d382': {\n", "│ │ │ 'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n", "│ │ │ 'origin': 'https://arxiv.org/pdf/2206.01062',\n", "│ │ │ 'path': '$.main-text[75]',\n", "│ │ │ 'page': 5,\n", "│ │ │ 'bbox': [53.26631546020508, 86.24749755859375, 295.562255859375, 215.95584106445312]\n", "│ │ }\n", "│ }\n", ")\n", "\n" ], "text/plain": [ "\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were annotated by humans.'\u001b[0m,\n", "\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: 
\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=