{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RAG with Docling and 🦙 LlamaIndex" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# requirements for this example:\n", "%pip install -qq docling docling-core python-dotenv llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "from tempfile import TemporaryDirectory\n", "\n", "from dotenv import load_dotenv\n", "from pydantic import TypeAdapter\n", "from rich.pretty import pprint\n", "\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Helpers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we define:\n", "\n", "- `DoclingPDFReader` which will be used to create LlamaIndex documents, and\n", "- `HierarchicalJSONNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from enum import Enum\n", "from pathlib import Path\n", "from typing import Any, Iterable\n", "\n", "from llama_index.core.readers.base import BasePydanticReader\n", "from llama_index.core.schema import Document as LIDocument\n", "\n", "from 
docling.document_converter import DocumentConverter\n",
"\n",
"\n",
"class DocMetaKeys(str, Enum):\n",
"    \"\"\"Metadata keys that `DoclingPDFReader` sets on the documents it yields.\"\"\"\n",
"\n",
"    DL_DOC_HASH = \"dl_doc_hash\"\n",
"    ORIGIN = \"origin\"\n",
"\n",
"\n",
"class DoclingPDFReader(BasePydanticReader):\n",
"    \"\"\"LlamaIndex reader that converts sources with Docling.\n",
"\n",
"    Every source is run through a Docling `DocumentConverter`; the converted\n",
"    document is exported as Markdown or as JSON, depending on `parse_type`.\n",
"    \"\"\"\n",
"\n",
"    class ParseType(str, Enum):\n",
"        \"\"\"Export formats supported by the reader.\"\"\"\n",
"\n",
"        MARKDOWN = \"markdown\"\n",
"        JSON = \"json\"\n",
"\n",
"    # export format used for the text of the yielded documents\n",
"    parse_type: ParseType = ParseType.MARKDOWN\n",
"\n",
"    def lazy_load_data(\n",
"        self,\n",
"        file_path: str | Path | Iterable[str] | Iterable[Path],\n",
"        *args: Any,\n",
"        **load_kwargs: Any,\n",
"    ) -> Iterable[LIDocument]:\n",
"        \"\"\"Lazily convert one or more sources into LlamaIndex documents.\n",
"\n",
"        Args:\n",
"            file_path: A single source (`str` or `Path`) or an iterable of\n",
"                sources; each one is handed to the Docling converter.\n",
"            *args: Unused.\n",
"            **load_kwargs: Unused.\n",
"\n",
"        Yields:\n",
"            One document per source. Its text is the Markdown or JSON export\n",
"            of the converted document, and its metadata holds the Docling\n",
"            document hash and the origin of the source.\n",
"\n",
"        Raises:\n",
"            RuntimeError: If `parse_type` is not a known `ParseType`.\n",
"        \"\"\"\n",
"        # a lone string or path is wrapped so that the loop below handles both cases\n",
"        file_paths = (\n",
"            file_path\n",
"            if isinstance(file_path, Iterable) and not isinstance(file_path, str)\n",
"            else [file_path]\n",
"        )\n",
"        converter = DocumentConverter()\n",
"        for source in file_paths:\n",
"            dl_doc = converter.convert_single(source).output\n",
"            match self.parse_type:\n",
"                case self.ParseType.MARKDOWN:\n",
"                    text = dl_doc.export_to_markdown()\n",
"                case self.ParseType.JSON:\n",
"                    text = dl_doc.model_dump_json()\n",
"                case _:\n",
"                    # report the offending value of the `parse_type` field\n",
"                    raise RuntimeError(\n",
"                        f\"Unexpected export type encountered: {self.parse_type}\"\n",
"                    )\n",
"            # metadata values are kept as plain strings, so paths are stringified\n",
"            origin = str(source) if isinstance(source, Path) else source\n",
"            li_doc = LIDocument(text=text)\n",
"            li_doc.metadata = {\n",
"                DocMetaKeys.DL_DOC_HASH: dl_doc.file_info.document_hash,\n",
"                DocMetaKeys.ORIGIN: origin,\n",
"            }\n",
"            yield li_doc" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from typing import Any, Iterable, Sequence\n",
"\n",
"from docling_core.transforms.chunker import ChunkWithMetadata, HierarchicalChunker\n",
"from docling_core.types import Document as DLDocument\n",
"from llama_index.core import Document as LIDocument\n",
"from llama_index.core.node_parser.interface import NodeParser\n",
"from llama_index.core.schema import (\n",
"    BaseNode,\n",
"    NodeRelationship,\n",
"    RelatedNodeType,\n",
"    TextNode,\n",
")\n",
"from llama_index.core.utils import get_tqdm_iterable\n",
"\n",
"\n",
"class NodeMetaKeys(str, 
Enum):\n",
"    \"\"\"Node-level metadata keys used by `HierarchicalJSONNodeParser`.\"\"\"\n",
"\n",
"    PATH = \"path\"\n",
"    PAGE = \"page\"\n",
"    BBOX = \"bbox\"\n",
"    ORIGIN = \"origin\"\n",
"\n",
"\n",
"class HierarchicalJSONNodeParser(NodeParser):\n",
"    \"\"\"Node parser that chunks JSON-serialized Docling documents.\n",
"\n",
"    The content of every input node is parsed as a Docling document and split\n",
"    with `HierarchicalChunker`; each resulting chunk becomes one `TextNode`.\n",
"    \"\"\"\n",
"\n",
"    def _parse_nodes(\n",
"        self,\n",
"        nodes: Sequence[BaseNode],\n",
"        show_progress: bool = False,\n",
"        **kwargs: Any,\n",
"    ) -> list[BaseNode]:\n",
"        \"\"\"Create one text node per Docling chunk of every input node.\n",
"\n",
"        Args:\n",
"            nodes: Input nodes whose content is a JSON-serialized Docling\n",
"                document.\n",
"            show_progress: Forwarded to `get_tqdm_iterable` to optionally\n",
"                report progress while iterating over `nodes`.\n",
"            **kwargs: Unused.\n",
"\n",
"        Returns:\n",
"            The text nodes created from all input nodes, in iteration order.\n",
"            Each node links back to its source document and carries the chunk\n",
"            path and, for chunks with metadata, the page and bounding box.\n",
"        \"\"\"\n",
"        nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(\n",
"            items=nodes, show_progress=show_progress, desc=\"Parsing nodes\"\n",
"        )\n",
"        # the excluded keys depend on neither the document nor the chunk, so\n",
"        # they are computed once instead of once per chunk\n",
"        excl_meta_keys = [d.value for d in DocMetaKeys] + [\n",
"            n.value for n in NodeMetaKeys\n",
"        ]\n",
"        chunker = HierarchicalChunker()\n",
"        all_nodes: list[BaseNode] = []\n",
"        for input_node in nodes_with_progress:\n",
"            li_doc = LIDocument.model_validate(input_node)\n",
"            dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())\n",
"            for chunk in chunker.chunk(dl_doc=dl_doc):\n",
"                rels: dict[NodeRelationship, RelatedNodeType] = {\n",
"                    NodeRelationship.SOURCE: li_doc.as_related_node_info(),\n",
"                }\n",
"                node = TextNode(\n",
"                    text=chunk.text,\n",
"                    # every node receives its own copies of the key list\n",
"                    excluded_embed_metadata_keys=list(excl_meta_keys),\n",
"                    excluded_llm_metadata_keys=list(excl_meta_keys),\n",
"                    relationships=rels,\n",
"                )\n",
"                node.metadata = {NodeMetaKeys.PATH: chunk.path}\n",
"                if isinstance(chunk, ChunkWithMetadata):\n",
"                    # page and bounding box are only read from chunks of this type\n",
"                    node.metadata[NodeMetaKeys.PAGE] = chunk.page\n",
"                    node.metadata[NodeMetaKeys.BBOX] = chunk.bbox\n",
"                all_nodes.append(node)\n",
"        return all_nodes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reader and node parser" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using JSON" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To leverage Docling's rich document structure format, we can namely export to JSON and use the HierarchicalJSONNodeParser accordingly:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": 
[ "reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)\n", "node_parser = HierarchicalJSONNodeParser()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using Markdown" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# from llama_index.core.node_parser import MarkdownNodeParser\n", "\n", "# reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n", "# node_parser = MarkdownNodeParser()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transformations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our transformations currently include the `node_parser`:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "transformations = [node_parser]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "One can add more transformations, e.g. 
further chunking based on text size / overlap, as shown below:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# from llama_index.core.node_parser import TokenTextSplitter\n", "\n", "# splitter = TokenTextSplitter(\n", "# chunk_size=1024,\n", "# chunk_overlap=0,\n", "# )\n", "# transformations.append(splitter)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Embed model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", "\n", "embed_model = HuggingFaceEmbedding(model_name=\"intfloat/multilingual-e5-small\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vector store" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "INGEST = True # whether to ingest from scratch or reuse an existing vector store" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "from llama_index.vector_stores.milvus import MilvusVectorStore\n", "\n", "MILVUS_URL = os.environ.get(\n", " \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n", ")\n", "MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"basic_llamaindex_pipeline\")\n", "MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get(\"MILVUS_KWARGS\", \"{}\"))\n", "\n", "vector_store = MilvusVectorStore(\n", " uri=MILVUS_URL,\n", " collection_name=MILVUS_COLL_NAME,\n", " dim=len(embed_model.get_text_embedding(\"hi\")),\n", " overwrite=INGEST,\n", " **MILVUS_KWARGS,\n", ")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[\n",
       "│   Document(\n",
       "│   │   id_='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n",
       "│   │   embedding=None,\n",
       "│   │   metadata={'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'+14, ... +1},\n",
       "│   │   excluded_embed_metadata_keys=[],\n",
       "│   │   excluded_llm_metadata_keys=[],\n",
       "│   │   relationships={},\n",
       "│   │   text='{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"'+173793,\n",
       "│   │   mimetype='text/plain',\n",
       "│   │   start_char_idx=None,\n",
       "│   │   end_char_idx=None,\n",
       "│   │   text_template='{metadata_str}\\n\\n{content}',\n",
       "│   │   metadata_template='{key}: {value}',\n",
       "│   │   metadata_seperator='\\n'\n",
       "│   )\n",
       "]\n",
       "
\n" ], "text/plain": [ "\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"'\u001b[0m+\u001b[1;36m173793\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ 
\u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1m)\u001b[0m\n", "\u001b[1m]\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from llama_index.core import StorageContext, VectorStoreIndex\n", "\n", "if INGEST:\n", " # in this case we ingest the data into the vector store\n", " docs = reader.load_data(\n", " file_path=\"https://arxiv.org/pdf/2206.01062\", # DocLayNet paper\n", " )\n", " pprint(docs, max_length=1, max_string=50, max_depth=4)\n", " storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", " index = VectorStoreIndex.from_documents(\n", " documents=docs,\n", " embed_model=embed_model,\n", " storage_context=storage_context,\n", " transformations=transformations,\n", " )\n", "else:\n", " # in this case we just load the vector store index\n", " index = VectorStoreIndex.from_vector_store(\n", " vector_store=vector_store,\n", " embed_model=embed_model,\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LLM" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n", "\n", "llm = HuggingFaceInferenceAPI(\n", " token=HF_API_KEY,\n", " model_name=\"mistralai/Mistral-7B-Instruct-v0.3\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## RAG" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Response(\n",
       "│   response='80863 pages were annotated by humans.',\n",
       "│   source_nodes=[\n",
       "│   │   NodeWithScore(\n",
       "│   │   │   node=TextNode(\n",
       "│   │   │   │   id_='79ee790b-73d8-4268-90d7-301b5cd5e8f4',\n",
       "│   │   │   │   embedding=None,\n",
       "│   │   │   │   metadata={\n",
       "│   │   │   │   │   'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n",
       "│   │   │   │   │   'origin': 'https://arxiv.org/pdf/2206.01062',\n",
       "│   │   │   │   │   'path': '$.main-text[36]',\n",
       "│   │   │   │   │   'page': 2,\n",
       "│   │   │   │   │   'bbox': [317.11236572265625, 116.19312286376953, 559.7131958007812, 202.27523803710938]\n",
       "│   │   │   │   },\n",
       "│   │   │   │   excluded_embed_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n",
       "│   │   │   │   excluded_llm_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n",
       "│   │   │   │   relationships={\n",
       "│   │   │   │   │   <NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(\n",
       "│   │   │   │   │   │   node_id='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n",
       "│   │   │   │   │   │   node_type=<ObjectType.DOCUMENT: '4'>,\n",
       "│   │   │   │   │   │   metadata={...},\n",
       "│   │   │   │   │   │   hash='10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\n",
       "│   │   │   │   │   ),\n",
       "│   │   │   │   │   <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(\n",
       "│   │   │   │   │   │   node_id='5509c0ef-2890-4bba-aa0f-82c0c389a621',\n",
       "│   │   │   │   │   │   node_type=<ObjectType.TEXT: '1'>,\n",
       "│   │   │   │   │   │   metadata={...},\n",
       "│   │   │   │   │   │   hash='d2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'\n",
       "│   │   │   │   │   ),\n",
       "│   │   │   │   │   <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(\n",
       "│   │   │   │   │   │   node_id='47f51f1f-e92f-4d82-b36e-466fa62f8e34',\n",
       "│   │   │   │   │   │   node_type=<ObjectType.TEXT: '1'>,\n",
       "│   │   │   │   │   │   metadata={...},\n",
       "│   │   │   │   │   │   hash='df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'\n",
       "│   │   │   │   │   )\n",
       "│   │   │   │   },\n",
       "│   │   │   │   text='3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'+296,\n",
       "│   │   │   │   mimetype='text/plain',\n",
       "│   │   │   │   start_char_idx=None,\n",
       "│   │   │   │   end_char_idx=None,\n",
       "│   │   │   │   text_template='{metadata_str}\\n\\n{content}',\n",
       "│   │   │   │   metadata_template='{key}: {value}',\n",
       "│   │   │   │   metadata_seperator='\\n'\n",
       "│   │   │   ),\n",
       "│   │   │   score=0.8344892859458923\n",
       "│   │   ),\n",
       "│   │   NodeWithScore(\n",
       "│   │   │   node=TextNode(\n",
       "│   │   │   │   id_='e1585b75-17f1-42b1-882a-f44e6ae4d382',\n",
       "│   │   │   │   embedding=None,\n",
       "│   │   │   │   metadata={\n",
       "│   │   │   │   │   'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n",
       "│   │   │   │   │   'origin': 'https://arxiv.org/pdf/2206.01062',\n",
       "│   │   │   │   │   'path': '$.main-text[75]',\n",
       "│   │   │   │   │   'page': 5,\n",
       "│   │   │   │   │   'bbox': [53.26631546020508, 86.24749755859375, 295.562255859375, 215.95584106445312]\n",
       "│   │   │   │   },\n",
       "│   │   │   │   excluded_embed_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n",
       "│   │   │   │   excluded_llm_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n",
       "│   │   │   │   relationships={\n",
       "│   │   │   │   │   <NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(\n",
       "│   │   │   │   │   │   node_id='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n",
       "│   │   │   │   │   │   node_type=<ObjectType.DOCUMENT: '4'>,\n",
       "│   │   │   │   │   │   metadata={...},\n",
       "│   │   │   │   │   │   hash='10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\n",
       "│   │   │   │   │   ),\n",
       "│   │   │   │   │   <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(\n",
       "│   │   │   │   │   │   node_id='964511c8-a412-47c4-8a3d-e4bf92edbda4',\n",
       "│   │   │   │   │   │   node_type=<ObjectType.TEXT: '1'>,\n",
       "│   │   │   │   │   │   metadata={...},\n",
       "│   │   │   │   │   │   hash='c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'\n",
       "│   │   │   │   │   ),\n",
       "│   │   │   │   │   <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(\n",
       "│   │   │   │   │   │   node_id='9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd',\n",
       "│   │   │   │   │   │   node_type=<ObjectType.TEXT: '1'>,\n",
       "│   │   │   │   │   │   metadata={...},\n",
       "│   │   │   │   │   │   hash='09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'\n",
       "│   │   │   │   │   )\n",
       "│   │   │   │   },\n",
       "│   │   │   │   text='4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore'+564,\n",
       "│   │   │   │   mimetype='text/plain',\n",
       "│   │   │   │   start_char_idx=None,\n",
       "│   │   │   │   end_char_idx=None,\n",
       "│   │   │   │   text_template='{metadata_str}\\n\\n{content}',\n",
       "│   │   │   │   metadata_template='{key}: {value}',\n",
       "│   │   │   │   metadata_seperator='\\n'\n",
       "│   │   │   ),\n",
       "│   │   │   score=0.8309065699577332\n",
       "│   │   )\n",
       "│   ],\n",
       "│   metadata={\n",
       "│   │   '79ee790b-73d8-4268-90d7-301b5cd5e8f4': {\n",
       "│   │   │   'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n",
       "│   │   │   'origin': 'https://arxiv.org/pdf/2206.01062',\n",
       "│   │   │   'path': '$.main-text[36]',\n",
       "│   │   │   'page': 2,\n",
       "│   │   │   'bbox': [317.11236572265625, 116.19312286376953, 559.7131958007812, 202.27523803710938]\n",
       "│   │   },\n",
       "│   │   'e1585b75-17f1-42b1-882a-f44e6ae4d382': {\n",
       "│   │   │   'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n",
       "│   │   │   'origin': 'https://arxiv.org/pdf/2206.01062',\n",
       "│   │   │   'path': '$.main-text[75]',\n",
       "│   │   │   'page': 5,\n",
       "│   │   │   'bbox': [53.26631546020508, 86.24749755859375, 295.562255859375, 215.95584106445312]\n",
       "│   │   }\n",
       "│   }\n",
       ")\n",
       "
\n" ], "text/plain": [ "\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were annotated by humans.'\u001b[0m,\n", "\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ │ 
\u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'5509c0ef-2890-4bba-aa0f-82c0c389a621'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'd2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ 
\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'47f51f1f-e92f-4d82-b36e-466fa62f8e34'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. 
The annotations provide layout information in the shape of'\u001b[0m\u001b[39m+\u001b[0m\u001b[1;36m296\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'text/plain'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\\n'\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m\u001b[39m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;36m.8344892859458923\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m\u001b[39m=\u001b[0m\u001b[1;35mTextNode\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ 
\u001b[0m\u001b[33membedding\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m5\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m86.24749755859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m295.562255859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m215.95584106445312\u001b[0m\u001b[1;39m]\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, 
\u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'964511c8-a412-47c4-8a3d-e4bf92edbda4'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ 
│ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd'\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m=\u001b[32m'09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'\u001b[0m\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. 
Therefore'\u001b[0m+\u001b[1;36m564\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.8309065699577332\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m: \u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: 
\u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m: \u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m5\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m, \u001b[1;36m86.24749755859375\u001b[0m, \u001b[1;36m295.562255859375\u001b[0m, \u001b[1;36m215.95584106445312\u001b[0m\u001b[1m]\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", "\u001b[1m)\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "query_engine = index.as_query_engine(llm=llm)\n", "query_res = query_engine.query(\"How many pages were annotated by humans?\")\n", "pprint(query_res, max_length=5, max_string=250, max_depth=6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 
}