From f4ee76eaecb821901fb694dc2612cb6af39dcc91 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 27 Sep 2024 19:31:43 +0200 Subject: [PATCH] chore: showcase extended metadata in LlamaIndex example Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- examples/rag_llamaindex.ipynb | 465 +++++++++++++++++++++++++++------- poetry.lock | 6 +- 2 files changed, 374 insertions(+), 97 deletions(-) diff --git a/examples/rag_llamaindex.ipynb b/examples/rag_llamaindex.ipynb index 6dd9e0f4..e29d0f7a 100644 --- a/examples/rag_llamaindex.ipynb +++ b/examples/rag_llamaindex.ipynb @@ -75,16 +75,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Reader and node parser" + "### Helpers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Below we set up:\n", - "- a `Reader` which will be used to create LlamaIndex documents, and\n", - "- a `NodeParser`, which will be used to create LlamaIndex nodes out of the documents" + "Below we define:\n", + "\n", + "- `DoclingPDFReader` which will be used to create LlamaIndex documents, and\n", + "- `HierarchicalJSONNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents\n" ] }, { @@ -94,50 +95,54 @@ "outputs": [], "source": [ "from enum import Enum\n", - "from typing import Iterable\n", + "from pathlib import Path\n", + "from typing import Any, Iterable\n", "\n", "from llama_index.core.readers.base import BasePydanticReader\n", "from llama_index.core.schema import Document as LIDocument\n", - "from pydantic import BaseModel\n", "\n", "from docling.document_converter import DocumentConverter\n", "\n", - "\n", - "class DocumentMetadata(BaseModel):\n", - " dl_doc_hash: str\n", - "\n", + "class DocMetaKeys(str, Enum):\n", + " DL_DOC_HASH = \"dl_doc_hash\"\n", + " ORIGIN = \"origin\"\n", "\n", "class DoclingPDFReader(BasePydanticReader):\n", " class ParseType(str, Enum):\n", " MARKDOWN = \"markdown\"\n", - " # JSON = 
\"json\"\n", + " JSON = \"json\"\n", "\n", " parse_type: ParseType = ParseType.MARKDOWN\n", "\n", - " def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:\n", - " file_paths = file_path if isinstance(file_path, list) else [file_path]\n", + " def lazy_load_data(\n", + " self,\n", + " file_path: str | Path | Iterable[str] | Iterable[Path],\n", + " *args: Any,\n", + " **load_kwargs: Any,\n", + " ) -> Iterable[LIDocument]:\n", + " file_paths = (\n", + " file_path\n", + " if isinstance(file_path, Iterable) and not isinstance(file_path, str)\n", + " else [file_path]\n", + " )\n", " converter = DocumentConverter()\n", " for source in file_paths:\n", " dl_doc = converter.convert_single(source).output\n", " match self.parse_type:\n", " case self.ParseType.MARKDOWN:\n", " text = dl_doc.export_to_markdown()\n", - " # case self.ParseType.JSON:\n", - " # text = dl_doc.model_dump_json()\n", + " case self.ParseType.JSON:\n", + " text = dl_doc.model_dump_json()\n", " case _:\n", " raise RuntimeError(\n", - " f\"Unexpected parse type encountered: {self.parse_type}\"\n", + " f\"Unexpected export type encountered: {self.export_type}\"\n", " )\n", - " excl_metadata_keys = [\"dl_doc_hash\"]\n", - " li_doc = LIDocument(\n", - " doc_id=dl_doc.file_info.document_hash,\n", - " text=text,\n", - " excluded_embed_metadata_keys=excl_metadata_keys,\n", - " excluded_llm_metadata_keys=excl_metadata_keys,\n", - " )\n", - " li_doc.metadata = DocumentMetadata(\n", - " dl_doc_hash=dl_doc.file_info.document_hash,\n", - " ).model_dump()\n", + " origin = str(source) if isinstance(source, Path) else source\n", + " li_doc = LIDocument(text=text)\n", + " li_doc.metadata = {\n", + " DocMetaKeys.DL_DOC_HASH: dl_doc.file_info.document_hash,\n", + " DocMetaKeys.ORIGIN: origin,\n", + " }\n", " yield li_doc" ] }, @@ -147,10 +152,143 @@ "metadata": {}, "outputs": [], "source": [ - "from llama_index.core.node_parser import MarkdownNodeParser\n", + "from typing import Any, Iterable, 
class NodeMetaKeys(str, Enum):
    """Keys of the metadata entries that may be attached to a parsed node."""

    PATH = "path"
    PAGE = "page"
    BBOX = "bbox"
    ORIGIN = "origin"


class HierarchicalJSONNodeParser(NodeParser):
    """Node parser for documents whose content is a Docling document as JSON.

    Each input node's content is validated as a Docling document, chunked with
    `HierarchicalChunker`, and every chunk is turned into a `TextNode` linked
    back to its source document.
    """

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[BaseNode]:
        """Split JSON-based document nodes into one `TextNode` per chunk.

        Args:
            nodes: Input nodes; each must validate as a LlamaIndex document
                whose content is the JSON of a Docling document.
            show_progress: Whether to wrap the iteration in a progress bar.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The text nodes created from all chunks of all input nodes, in order.
        """
        nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(
            items=nodes, show_progress=show_progress, desc="Parsing nodes"
        )

        # The exclusion keys do not depend on the document or the chunk, so
        # they are computed once. DocMetaKeys and NodeMetaKeys both define
        # "origin"; dict.fromkeys drops the duplicate while keeping the order.
        excl_doc_meta_keys = [d.value for d in DocMetaKeys]
        excl_node_meta_keys = [n.value for n in NodeMetaKeys]
        excl_meta_keys = list(dict.fromkeys(excl_doc_meta_keys + excl_node_meta_keys))

        all_nodes: list[BaseNode] = []
        chunker = HierarchicalChunker()
        for input_node in nodes_with_progress:
            li_doc = LIDocument.model_validate(input_node)
            dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())
            chunk_iter = chunker.chunk(dl_doc=dl_doc)
            for chunk in chunk_iter:
                rels: dict[NodeRelationship, RelatedNodeType] = {
                    NodeRelationship.SOURCE: li_doc.as_related_node_info(),
                }
                node = TextNode(
                    text=chunk.text,
                    # Separate copies, so no list object is shared between the
                    # two fields or across nodes.
                    excluded_embed_metadata_keys=list(excl_meta_keys),
                    excluded_llm_metadata_keys=list(excl_meta_keys),
                    relationships=rels,
                )
                node.metadata = {NodeMetaKeys.PATH: chunk.path}
                # Page and bounding box are only read from chunks of the
                # richer ChunkWithMetadata type.
                if isinstance(chunk, ChunkWithMetadata):
                    node.metadata[NodeMetaKeys.PAGE] = chunk.page
                    node.metadata[NodeMetaKeys.BBOX] = chunk.bbox
                all_nodes.append(node)
        return all_nodes
+ "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -171,7 +309,7 @@ "\n", "# splitter = TokenTextSplitter(\n", "# chunk_size=1024,\n", - "# chunk_overlap=20,\n", + "# chunk_overlap=0,\n", "# )\n", "# transformations.append(splitter)" ] @@ -185,13 +323,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", "\n", - "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")" + "embed_model = HuggingFaceEmbedding(model_name=\"intfloat/multilingual-e5-small\")" ] }, { @@ -203,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -212,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -234,6 +372,7 @@ ")\n", "MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"basic_llamaindex_pipeline\")\n", "MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get(\"MILVUS_KWARGS\", \"{}\"))\n", + "\n", "vector_store = MilvusVectorStore(\n", " uri=MILVUS_URL,\n", " collection_name=MILVUS_COLL_NAME,\n", @@ -245,35 +384,21 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "536daee038de4d52a793445c6d853c72", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 7 files: 0%| | 0/7 [00:00[\n", "Document(\n", - "│ │ id_='5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'+14,\n", + "│ │ id_='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n", "│ │ embedding=None,\n", - "│ │ metadata={'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'+14},\n", - "│ │ excluded_embed_metadata_keys=['dl_doc_hash'],\n", - "│ │ excluded_llm_metadata_keys=['dl_doc_hash'],\n", + "│ │ metadata={'dl_doc_hash': 
'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'+14, ... +1},\n", + "│ │ excluded_embed_metadata_keys=[],\n", + "│ │ excluded_llm_metadata_keys=[],\n", "│ │ relationships={},\n", - "│ │ text='## DocLayNet: A Large Human-Annotated Dataset for '+50593,\n", + "│ │ text='{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"'+173793,\n", "│ │ mimetype='text/plain',\n", "│ │ start_char_idx=None,\n", "│ │ end_char_idx=None,\n", @@ -287,13 +412,13 @@ "text/plain": [ "\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m\u001b[1m}\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ 
\u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'## DocLayNet: A Large Human-Annotated Dataset for '\u001b[0m+\u001b[1;36m50593\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"'\u001b[0m+\u001b[1;36m173793\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", @@ -341,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -364,75 +489,227 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Response(\n",
-       "response='80863 pages were human annotated.',\n",
+       "response='80863 pages were annotated by humans.',\n",
        "source_nodes=[\n",
        "│   │   NodeWithScore(\n",
        "│   │   │   node=TextNode(\n",
-       "│   │   │   │   id_='8874a117-d181-4f4f-a30b-0b5604370d77',\n",
+       "│   │   │   │   id_='79ee790b-73d8-4268-90d7-301b5cd5e8f4',\n",
        "│   │   │   │   embedding=None,\n",
-       "│   │   │   │   metadata={...},\n",
-       "│   │   │   │   excluded_embed_metadata_keys=[...],\n",
-       "│   │   │   │   excluded_llm_metadata_keys=[...],\n",
-       "│   │   │   │   relationships={...},\n",
-       "│   │   │   │   text='3 THE DOCLAYNET DATASET\\n\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape o'+5775,\n",
+       "│   │   │   │   metadata={\n",
+       "│   │   │   │   │   'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n",
+       "│   │   │   │   │   'origin': 'https://arxiv.org/pdf/2206.01062',\n",
+       "│   │   │   │   │   'path': '$.main-text[36]',\n",
+       "│   │   │   │   │   'page': 2,\n",
+       "│   │   │   │   │   'bbox': [317.11236572265625, 116.19312286376953, 559.7131958007812, 202.27523803710938]\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   excluded_embed_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n",
+       "│   │   │   │   excluded_llm_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n",
+       "│   │   │   │   relationships={\n",
+       "│   │   │   │   │   <NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(\n",
+       "│   │   │   │   │   │   node_id='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n",
+       "│   │   │   │   │   │   node_type=<ObjectType.DOCUMENT: '4'>,\n",
+       "│   │   │   │   │   │   metadata={...},\n",
+       "│   │   │   │   │   │   hash='10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\n",
+       "│   │   │   │   │   ),\n",
+       "│   │   │   │   │   <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(\n",
+       "│   │   │   │   │   │   node_id='5509c0ef-2890-4bba-aa0f-82c0c389a621',\n",
+       "│   │   │   │   │   │   node_type=<ObjectType.TEXT: '1'>,\n",
+       "│   │   │   │   │   │   metadata={...},\n",
+       "│   │   │   │   │   │   hash='d2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'\n",
+       "│   │   │   │   │   ),\n",
+       "│   │   │   │   │   <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(\n",
+       "│   │   │   │   │   │   node_id='47f51f1f-e92f-4d82-b36e-466fa62f8e34',\n",
+       "│   │   │   │   │   │   node_type=<ObjectType.TEXT: '1'>,\n",
+       "│   │   │   │   │   │   metadata={...},\n",
+       "│   │   │   │   │   │   hash='df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'\n",
+       "│   │   │   │   │   )\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   text='3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'+296,\n",
+       "│   │   │   │   mimetype='text/plain',\n",
+       "│   │   │   │   start_char_idx=None,\n",
+       "│   │   │   │   end_char_idx=None,\n",
+       "│   │   │   │   text_template='{metadata_str}\\n\\n{content}',\n",
+       "│   │   │   │   metadata_template='{key}: {value}',\n",
+       "│   │   │   │   metadata_seperator='\\n'\n",
+       "│   │   │   ),\n",
+       "│   │   │   score=0.8344892859458923\n",
+       "│   │   ),\n",
+       "│   │   NodeWithScore(\n",
+       "│   │   │   node=TextNode(\n",
+       "│   │   │   │   id_='e1585b75-17f1-42b1-882a-f44e6ae4d382',\n",
+       "│   │   │   │   embedding=None,\n",
+       "│   │   │   │   metadata={\n",
+       "│   │   │   │   │   'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n",
+       "│   │   │   │   │   'origin': 'https://arxiv.org/pdf/2206.01062',\n",
+       "│   │   │   │   │   'path': '$.main-text[75]',\n",
+       "│   │   │   │   │   'page': 5,\n",
+       "│   │   │   │   │   'bbox': [53.26631546020508, 86.24749755859375, 295.562255859375, 215.95584106445312]\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   excluded_embed_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n",
+       "│   │   │   │   excluded_llm_metadata_keys=['dl_doc_hash', 'origin', 'path', 'page', 'bbox', ... +1],\n",
+       "│   │   │   │   relationships={\n",
+       "│   │   │   │   │   <NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(\n",
+       "│   │   │   │   │   │   node_id='83f7b6f1-33e3-493f-8240-95662a93d4dc',\n",
+       "│   │   │   │   │   │   node_type=<ObjectType.DOCUMENT: '4'>,\n",
+       "│   │   │   │   │   │   metadata={...},\n",
+       "│   │   │   │   │   │   hash='10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\n",
+       "│   │   │   │   │   ),\n",
+       "│   │   │   │   │   <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(\n",
+       "│   │   │   │   │   │   node_id='964511c8-a412-47c4-8a3d-e4bf92edbda4',\n",
+       "│   │   │   │   │   │   node_type=<ObjectType.TEXT: '1'>,\n",
+       "│   │   │   │   │   │   metadata={...},\n",
+       "│   │   │   │   │   │   hash='c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'\n",
+       "│   │   │   │   │   ),\n",
+       "│   │   │   │   │   <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(\n",
+       "│   │   │   │   │   │   node_id='9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd',\n",
+       "│   │   │   │   │   │   node_type=<ObjectType.TEXT: '1'>,\n",
+       "│   │   │   │   │   │   metadata={...},\n",
+       "│   │   │   │   │   │   hash='09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'\n",
+       "│   │   │   │   │   )\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   text='4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore'+564,\n",
        "│   │   │   │   mimetype='text/plain',\n",
-       "│   │   │   │   start_char_idx=9089,\n",
-       "│   │   │   │   end_char_idx=15114,\n",
+       "│   │   │   │   start_char_idx=None,\n",
+       "│   │   │   │   end_char_idx=None,\n",
        "│   │   │   │   text_template='{metadata_str}\\n\\n{content}',\n",
        "│   │   │   │   metadata_template='{key}: {value}',\n",
        "│   │   │   │   metadata_seperator='\\n'\n",
        "│   │   │   ),\n",
-       "│   │   │   score=0.7367570400238037\n",
-       "│   │   ),\n",
-       "│   │   ... +1\n",
+       "│   │   │   score=0.8309065699577332\n",
+       "│   │   )\n",
        "],\n",
        "metadata={\n",
-       "│   │   '8874a117-d181-4f4f-a30b-0b5604370d77': {\n",
+       "│   │   '79ee790b-73d8-4268-90d7-301b5cd5e8f4': {\n",
        "│   │   │   'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n",
-       "│   │   │   ... +1\n",
+       "│   │   │   'origin': 'https://arxiv.org/pdf/2206.01062',\n",
+       "│   │   │   'path': '$.main-text[36]',\n",
+       "│   │   │   'page': 2,\n",
+       "│   │   │   'bbox': [317.11236572265625, 116.19312286376953, 559.7131958007812, 202.27523803710938]\n",
        "│   │   },\n",
-       "│   │   ... +1\n",
+       "│   │   'e1585b75-17f1-42b1-882a-f44e6ae4d382': {\n",
+       "│   │   │   'dl_doc_hash': '5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc',\n",
+       "│   │   │   'origin': 'https://arxiv.org/pdf/2206.01062',\n",
+       "│   │   │   'path': '$.main-text[75]',\n",
+       "│   │   │   'page': 5,\n",
+       "│   │   │   'bbox': [53.26631546020508, 86.24749755859375, 295.562255859375, 215.95584106445312]\n",
+       "│   │   }\n",
        "}\n",
        ")\n",
        "
\n" ], "text/plain": [ "\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were human annotated.'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were annotated by humans.'\u001b[0m,\n", "\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'8874a117-d181-4f4f-a30b-0b5604370d77'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'3 THE DOCLAYNET DATASET\\n\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. 
The annotations provide layout information in the shape o'\u001b[0m+\u001b[1;36m5775\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", + 
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'5509c0ef-2890-4bba-aa0f-82c0c389a621'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'd2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'47f51f1f-e92f-4d82-b36e-466fa62f8e34'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ 
\u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'\u001b[0m\u001b[39m+\u001b[0m\u001b[1;36m296\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'text/plain'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ 
\u001b[0m\u001b[33mmetadata_seperator\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\\n'\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m\u001b[39m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;36m.8344892859458923\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1;39m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m\u001b[39m=\u001b[0m\u001b[1;35mTextNode\u001b[0m\u001b[1;39m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m5\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m86.24749755859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m295.562255859375\u001b[0m\u001b[39m, 
\u001b[0m\u001b[1;36m215.95584106445312\u001b[0m\u001b[1;39m]\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ 
\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'964511c8-a412-47c4-8a3d-e4bf92edbda4'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m=\u001b[32m'09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. 
Therefore'\u001b[0m+\u001b[1;36m564\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[1;36m9089\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[1;36m15114\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.7367570400238037\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.8309065699577332\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[32m'8874a117-d181-4f4f-a30b-0b5604370d77'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m: \u001b[1m{\u001b[0m\n", "\u001b[2;32m│ │ 
│ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m5\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m, \u001b[1;36m86.24749755859375\u001b[0m, \u001b[1;36m295.562255859375\u001b[0m, \u001b[1;36m215.95584106445312\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", 
"\u001b[1m)\u001b[0m\n" ] @@ -443,8 +720,8 @@ ], "source": [ "query_engine = index.as_query_engine(llm=llm)\n", - "query_res = query_engine.query(\"How many pages were human annotated?\")\n", - "pprint(query_res, max_length=1, max_string=250, max_depth=4)" + "query_res = query_engine.query(\"How many pages were annotated by humans?\")\n", + "pprint(query_res, max_length=5, max_string=250, max_depth=6)" ] }, { diff --git a/poetry.lock b/poetry.lock index 7733ecb7..d7ded2ca 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2530,13 +2530,13 @@ llama-index-core = ">=0.11.0,<0.12.0" [[package]] name = "llama-index-vector-stores-milvus" -version = "0.2.3" +version = "0.2.5" description = "llama-index vector_stores milvus integration" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_vector_stores_milvus-0.2.3-py3-none-any.whl", hash = "sha256:287c3b2b8d886eac11b07db3ddf31b92dee55ac4f00fe7dc047879e2f7d79d67"}, - {file = "llama_index_vector_stores_milvus-0.2.3.tar.gz", hash = "sha256:3b2433869264f13aee752ee086f8741ea79222ebbaa0c437fa9fb21a8d56cdaf"}, + {file = "llama_index_vector_stores_milvus-0.2.5-py3-none-any.whl", hash = "sha256:020d96dad541ee80c480d7ed6fe7587020a8c8a20d54b320fa376ef4dd8c8648"}, + {file = "llama_index_vector_stores_milvus-0.2.5.tar.gz", hash = "sha256:b68093f0bf2654dd6436678bb182e1c7f3e549efc4bc9e0f10039bc6b73f7b53"}, ] [package.dependencies]