chore: showcase extended metadata in LlamaIndex example

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-09-27 19:31:43 +02:00
parent 9b82ae3324
commit f4ee76eaec
2 changed files with 374 additions and 97 deletions

View File

@ -75,16 +75,17 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Reader and node parser" "### Helpers"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Below we set up:\n", "Below we define:\n",
"- a `Reader` which will be used to create LlamaIndex documents, and\n", "\n",
"- a `NodeParser`, which will be used to create LlamaIndex nodes out of the documents" "- `DoclingPDFReader` which will be used to create LlamaIndex documents, and\n",
"- `HierarchicalJSONNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents\n"
] ]
}, },
{ {
@ -94,50 +95,54 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"from enum import Enum\n", "from enum import Enum\n",
"from typing import Iterable\n", "from pathlib import Path\n",
"from typing import Any, Iterable\n",
"\n", "\n",
"from llama_index.core.readers.base import BasePydanticReader\n", "from llama_index.core.readers.base import BasePydanticReader\n",
"from llama_index.core.schema import Document as LIDocument\n", "from llama_index.core.schema import Document as LIDocument\n",
"from pydantic import BaseModel\n",
"\n", "\n",
"from docling.document_converter import DocumentConverter\n", "from docling.document_converter import DocumentConverter\n",
"\n", "\n",
"\n", "class DocMetaKeys(str, Enum):\n",
"class DocumentMetadata(BaseModel):\n", " DL_DOC_HASH = \"dl_doc_hash\"\n",
" dl_doc_hash: str\n", " ORIGIN = \"origin\"\n",
"\n",
"\n", "\n",
"class DoclingPDFReader(BasePydanticReader):\n", "class DoclingPDFReader(BasePydanticReader):\n",
" class ParseType(str, Enum):\n", " class ParseType(str, Enum):\n",
" MARKDOWN = \"markdown\"\n", " MARKDOWN = \"markdown\"\n",
" # JSON = \"json\"\n", " JSON = \"json\"\n",
"\n", "\n",
" parse_type: ParseType = ParseType.MARKDOWN\n", " parse_type: ParseType = ParseType.MARKDOWN\n",
"\n", "\n",
" def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:\n", " def lazy_load_data(\n",
" file_paths = file_path if isinstance(file_path, list) else [file_path]\n", " self,\n",
" file_path: str | Path | Iterable[str] | Iterable[Path],\n",
" *args: Any,\n",
" **load_kwargs: Any,\n",
" ) -> Iterable[LIDocument]:\n",
" file_paths = (\n",
" file_path\n",
" if isinstance(file_path, Iterable) and not isinstance(file_path, str)\n",
" else [file_path]\n",
" )\n",
" converter = DocumentConverter()\n", " converter = DocumentConverter()\n",
" for source in file_paths:\n", " for source in file_paths:\n",
" dl_doc = converter.convert_single(source).output\n", " dl_doc = converter.convert_single(source).output\n",
" match self.parse_type:\n", " match self.parse_type:\n",
" case self.ParseType.MARKDOWN:\n", " case self.ParseType.MARKDOWN:\n",
" text = dl_doc.export_to_markdown()\n", " text = dl_doc.export_to_markdown()\n",
" # case self.ParseType.JSON:\n", " case self.ParseType.JSON:\n",
" # text = dl_doc.model_dump_json()\n", " text = dl_doc.model_dump_json()\n",
" case _:\n", " case _:\n",
" raise RuntimeError(\n", " raise RuntimeError(\n",
" f\"Unexpected parse type encountered: {self.parse_type}\"\n", " f\"Unexpected export type encountered: {self.export_type}\"\n",
" )\n", " )\n",
" excl_metadata_keys = [\"dl_doc_hash\"]\n", " origin = str(source) if isinstance(source, Path) else source\n",
" li_doc = LIDocument(\n", " li_doc = LIDocument(text=text)\n",
" doc_id=dl_doc.file_info.document_hash,\n", " li_doc.metadata = {\n",
" text=text,\n", " DocMetaKeys.DL_DOC_HASH: dl_doc.file_info.document_hash,\n",
" excluded_embed_metadata_keys=excl_metadata_keys,\n", " DocMetaKeys.ORIGIN: origin,\n",
" excluded_llm_metadata_keys=excl_metadata_keys,\n", " }\n",
" )\n",
" li_doc.metadata = DocumentMetadata(\n",
" dl_doc_hash=dl_doc.file_info.document_hash,\n",
" ).model_dump()\n",
" yield li_doc" " yield li_doc"
] ]
}, },
@ -147,10 +152,143 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from llama_index.core.node_parser import MarkdownNodeParser\n", "from typing import Any, Iterable, Sequence\n",
"\n", "\n",
"reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n", "from docling_core.transforms.chunker import ChunkWithMetadata, HierarchicalChunker\n",
"node_parser = MarkdownNodeParser()\n", "from docling_core.types import Document as DLDocument\n",
"from llama_index.core import Document as LIDocument\n",
"from llama_index.core.node_parser.interface import NodeParser\n",
"from llama_index.core.schema import (\n",
" BaseNode,\n",
" NodeRelationship,\n",
" RelatedNodeType,\n",
" TextNode,\n",
")\n",
"from llama_index.core.utils import get_tqdm_iterable\n",
"\n",
"\n",
"class NodeMetaKeys(str, Enum):\n",
" PATH = \"path\"\n",
" PAGE = \"page\"\n",
" BBOX = \"bbox\"\n",
" ORIGIN = \"origin\"\n",
"\n",
"\n",
"class HierarchicalJSONNodeParser(NodeParser):\n",
"\n",
" def _parse_nodes(\n",
" self,\n",
" nodes: Sequence[BaseNode],\n",
" show_progress: bool = False,\n",
" **kwargs: Any,\n",
" ) -> list[BaseNode]:\n",
" nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(\n",
" items=nodes, show_progress=show_progress, desc=\"Parsing nodes\"\n",
" )\n",
" all_nodes: list[BaseNode] = []\n",
" chunker = HierarchicalChunker()\n",
" for input_node in nodes_with_progress:\n",
" li_doc = LIDocument.model_validate(input_node)\n",
" dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())\n",
" chunk_iter = chunker.chunk(dl_doc=dl_doc)\n",
" for chunk in chunk_iter:\n",
" rels: dict[NodeRelationship, RelatedNodeType] = {\n",
" NodeRelationship.SOURCE: li_doc.as_related_node_info(),\n",
" }\n",
" excl_doc_meta_keys = [d.value for d in DocMetaKeys]\n",
" excl_node_meta_keys = [n.value for n in NodeMetaKeys]\n",
" excl_meta_keys = excl_doc_meta_keys + excl_node_meta_keys\n",
" node = TextNode(\n",
" text=chunk.text,\n",
" excluded_embed_metadata_keys=excl_meta_keys,\n",
" excluded_llm_metadata_keys=excl_meta_keys,\n",
" relationships=rels,\n",
" )\n",
" node.metadata = {NodeMetaKeys.PATH: chunk.path}\n",
" if isinstance(chunk, ChunkWithMetadata):\n",
" node.metadata[NodeMetaKeys.PAGE] = chunk.page\n",
" node.metadata[NodeMetaKeys.BBOX] = chunk.bbox\n",
" all_nodes.append(node)\n",
" return all_nodes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reader and node parser"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using JSON"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To leverage Docling's rich document structure format, we can namely export to JSON and use the HierarchicalJSONNodeParser accordingly:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)\n",
"node_parser = HierarchicalJSONNodeParser()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using Markdown"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# from llama_index.core.node_parser import MarkdownNodeParser\n",
"\n",
"# reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n",
"# node_parser = MarkdownNodeParser()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Transformations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Our transformations currently include the `node_parser`:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"transformations = [node_parser]" "transformations = [node_parser]"
] ]
}, },
@ -163,7 +301,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -171,7 +309,7 @@
"\n", "\n",
"# splitter = TokenTextSplitter(\n", "# splitter = TokenTextSplitter(\n",
"# chunk_size=1024,\n", "# chunk_size=1024,\n",
"# chunk_overlap=20,\n", "# chunk_overlap=0,\n",
"# )\n", "# )\n",
"# transformations.append(splitter)" "# transformations.append(splitter)"
] ]
@ -185,13 +323,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
"\n", "\n",
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")" "embed_model = HuggingFaceEmbedding(model_name=\"intfloat/multilingual-e5-small\")"
] ]
}, },
{ {
@ -203,7 +341,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -212,7 +350,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -234,6 +372,7 @@
")\n", ")\n",
"MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"basic_llamaindex_pipeline\")\n", "MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"basic_llamaindex_pipeline\")\n",
"MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get(\"MILVUS_KWARGS\", \"{}\"))\n", "MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get(\"MILVUS_KWARGS\", \"{}\"))\n",
"\n",
"vector_store = MilvusVectorStore(\n", "vector_store = MilvusVectorStore(\n",
" uri=MILVUS_URL,\n", " uri=MILVUS_URL,\n",
" collection_name=MILVUS_COLL_NAME,\n", " collection_name=MILVUS_COLL_NAME,\n",
@ -245,35 +384,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "536daee038de4d52a793445c6d853c72",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">[</span>\n", "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">[</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Document</span><span style=\"font-weight: bold\">(</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Document</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span><span style=\"font-weight: bold\">}</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"font-weight: bold\">]</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"font-weight: bold\">]</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{}</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'## DocLayNet: A Large Human-Annotated Dataset for '</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">50593</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">173793</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
@ -287,13 +412,13 @@
"text/plain": [ "text/plain": [
"\u001b[1m[\u001b[0m\n", "\u001b[1m[\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'## DocLayNet: A Large Human-Annotated Dataset for '\u001b[0m+\u001b[1;36m50593\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"'\u001b[0m+\u001b[1;36m173793\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
@ -341,7 +466,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -364,75 +489,227 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Response</span><span style=\"font-weight: bold\">(</span>\n", "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Response</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">response</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'80863 pages were human annotated.'</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">response</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'80863 pages were annotated by humans.'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">source_nodes</span>=<span style=\"font-weight: bold\">[</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">source_nodes</span>=<span style=\"font-weight: bold\">[</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"font-weight: bold\">(</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"font-weight: bold\">(</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'8874a117-d181-4f4f-a30b-0b5604370d77'</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'79ee790b-73d8-4268-90d7-301b5cd5e8f4'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">]</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">]</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[36]'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'3 THE DOCLAYNET DATASET\\n\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape o'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5775</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">317.11236572265625</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">116.19312286376953</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">559.7131958007812</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">202.27523803710938</span><span style=\"font-weight: bold\">]</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">NodeRelationship.SOURCE:</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=&lt;ObjectType.DOCUMENT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'4'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\">&lt;NodeRelationship.PREVIOUS: </span><span style=\"color: #008000; text-decoration-color: #008000\">'2'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'5509c0ef-2890-4bba-aa0f-82c0c389a621'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=&lt;ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'d2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\">&lt;NodeRelationship.NEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'3'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'47f51f1f-e92f-4d82-b36e-466fa62f8e34'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=&lt;ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'</span><span style=\"color: #000000; text-decoration-color: #000000\">+</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">296</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.8344892859458923</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'e1585b75-17f1-42b1-882a-f44e6ae4d382'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[75]'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">53.26631546020508</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">86.24749755859375</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">295.562255859375</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">215.95584106445312</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000\"> +</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000\"> +</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\">&lt;NodeRelationship.SOURCE: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=&lt;ObjectType.DOCUMENT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'4'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\">&lt;NodeRelationship.PREVIOUS: </span><span style=\"color: #008000; text-decoration-color: #008000\">'2'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'964511c8-a412-47c4-8a3d-e4bf92edbda4'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=&lt;ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\">&lt;NodeRelationship.NEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'3'</span><span style=\"color: #000000; text-decoration-color: #000000\">&gt;: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=&lt;ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"font-weight: bold\">&gt;</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\">)</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">564</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">9089</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">15114</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"font-weight: bold\">)</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"font-weight: bold\">)</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.7367570400238037</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.8309065699577332</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">]</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'8874a117-d181-4f4f-a30b-0b5604370d77'</span>: <span style=\"font-weight: bold\">{</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'79ee790b-73d8-4268-90d7-301b5cd5e8f4'</span>: <span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[36]'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">317.11236572265625</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">116.19312286376953</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">559.7131958007812</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">202.27523803710938</span><span style=\"font-weight: bold\">]</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">}</span>,\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'e1585b75-17f1-42b1-882a-f44e6ae4d382'</span>: <span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[75]'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">53.26631546020508</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">86.24749755859375</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">295.562255859375</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">215.95584106445312</span><span style=\"font-weight: bold\">]</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">}</span>\n", "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"font-weight: bold\">)</span>\n", "<span style=\"font-weight: bold\">)</span>\n",
"</pre>\n" "</pre>\n"
], ],
"text/plain": [ "text/plain": [
"\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were human annotated.'\u001b[0m,\n", "\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were annotated by humans.'\u001b[0m,\n",
"\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'8874a117-d181-4f4f-a30b-0b5604370d77'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'3 THE DOCLAYNET DATASET\\n\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape o'\u001b[0m+\u001b[1;36m5775\u001b[0m,\n", "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.DOCUMENT: \u001b[0m\u001b[32m'4'\u001b[0m\u001b[39m>,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.PREVIOUS: \u001b[0m\u001b[32m'2'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'5509c0ef-2890-4bba-aa0f-82c0c389a621'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'd2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.NEXT: \u001b[0m\u001b[32m'3'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'47f51f1f-e92f-4d82-b36e-466fa62f8e34'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'\u001b[0m\u001b[39m+\u001b[0m\u001b[1;36m296\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'text/plain'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\\n'\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m\u001b[39m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;36m.8344892859458923\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1;39m(\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m\u001b[39m=\u001b[0m\u001b[1;35mTextNode\u001b[0m\u001b[1;39m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m5\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m86.24749755859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m295.562255859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m215.95584106445312\u001b[0m\u001b[1;39m]\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.SOURCE: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.DOCUMENT: \u001b[0m\u001b[32m'4'\u001b[0m\u001b[39m>,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.PREVIOUS: \u001b[0m\u001b[32m'2'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'964511c8-a412-47c4-8a3d-e4bf92edbda4'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.NEXT: \u001b[0m\u001b[32m'3'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd'\u001b[0m\u001b[39m,\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[1m>\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m=\u001b[32m'09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore'\u001b[0m+\u001b[1;36m564\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[1;36m9089\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[1;36m15114\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.7367570400238037\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.8309065699577332\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[32m'8874a117-d181-4f4f-a30b-0b5604370d77'\u001b[0m: \u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m: \u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m: \u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m5\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m, \u001b[1;36m86.24749755859375\u001b[0m, \u001b[1;36m295.562255859375\u001b[0m, \u001b[1;36m215.95584106445312\u001b[0m\u001b[1m]\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[1m)\u001b[0m\n" "\u001b[1m)\u001b[0m\n"
] ]
@ -443,8 +720,8 @@
], ],
"source": [ "source": [
"query_engine = index.as_query_engine(llm=llm)\n", "query_engine = index.as_query_engine(llm=llm)\n",
"query_res = query_engine.query(\"How many pages were human annotated?\")\n", "query_res = query_engine.query(\"How many pages were annotated by humans?\")\n",
"pprint(query_res, max_length=1, max_string=250, max_depth=4)" "pprint(query_res, max_length=5, max_string=250, max_depth=6)"
] ]
}, },
{ {

6
poetry.lock generated
View File

@ -2530,13 +2530,13 @@ llama-index-core = ">=0.11.0,<0.12.0"
[[package]] [[package]]
name = "llama-index-vector-stores-milvus" name = "llama-index-vector-stores-milvus"
version = "0.2.3" version = "0.2.5"
description = "llama-index vector_stores milvus integration" description = "llama-index vector_stores milvus integration"
optional = false optional = false
python-versions = "<4.0,>=3.8.1" python-versions = "<4.0,>=3.8.1"
files = [ files = [
{file = "llama_index_vector_stores_milvus-0.2.3-py3-none-any.whl", hash = "sha256:287c3b2b8d886eac11b07db3ddf31b92dee55ac4f00fe7dc047879e2f7d79d67"}, {file = "llama_index_vector_stores_milvus-0.2.5-py3-none-any.whl", hash = "sha256:020d96dad541ee80c480d7ed6fe7587020a8c8a20d54b320fa376ef4dd8c8648"},
{file = "llama_index_vector_stores_milvus-0.2.3.tar.gz", hash = "sha256:3b2433869264f13aee752ee086f8741ea79222ebbaa0c437fa9fb21a8d56cdaf"}, {file = "llama_index_vector_stores_milvus-0.2.5.tar.gz", hash = "sha256:b68093f0bf2654dd6436678bb182e1c7f3e549efc4bc9e0f10039bc6b73f7b53"},
] ]
[package.dependencies] [package.dependencies]