minor notebook updates

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-10-04 14:50:38 +02:00
parent 6e16a2464e
commit 86fd560cfd
2 changed files with 28 additions and 16 deletions

View File

@ -74,7 +74,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loader and splitter"
"### Helpers"
]
},
{
@ -225,6 +225,13 @@
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loader and splitter"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -420,10 +427,10 @@
"output_type": "stream",
"text": [
"Question:\n",
"How many pages were human annotated for DocLayNet?\n",
"How many pages were human annotated by humans for DocLayNet?\n",
"\n",
"Answer:\n",
"\"80863 pages were human annotated for DocLayNet.\\nExplanation:\\nFrom the context, it is clear that DocL...\"\n",
"\"80863 pages were annotated by humans in DocLayNet.\"\n",
"\n",
"Source 1:\n",
" text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n",
@ -442,29 +449,36 @@
" path: $.main-text[23]\n",
"\n",
"Source 3:\n",
" text: \"Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11...\"\n",
" bbox: [317.3695373535156, 82.78482818603516, 559.7149047851562, 244.83221435546875]\n",
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
" heading: 4 ANNOTATION CAMPAIGN\n",
" page: 5\n",
" path: $.main-text[80]\n",
"\n",
"Source 4:\n",
" text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
" bbox: [53.60108947753906, 723.3781127929688, 347.139892578125, 731.6909790039062]\n",
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
" heading: REFERENCES\n",
" page: 9\n",
" path: $.main-text[133]\n"
" path: $.main-text[133]\n",
"\n",
"Source 4:\n",
" text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
" bbox: [53.542964935302734, 723.3500366210938, 347.0172424316406, 731.6931762695312]\n",
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
" heading: 4 ANNOTATION CAMPAIGN\n",
" page: 5\n",
" path: $.main-text[64]\n"
]
}
],
"source": [
"resp_dict = rag_chain.invoke(\n",
" {\"input\": \"How many pages were human annotated for DocLayNet?\"}\n",
" {\"input\": \"How many pages were human annotated by humans for DocLayNet?\"}\n",
")\n",
"print_qa(resp_dict=resp_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@ -43,11 +43,8 @@
],
"source": [
"import os\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"from dotenv import load_dotenv\n",
"from pydantic import TypeAdapter\n",
"from rich.pretty import pprint\n",
"\n",
"load_dotenv()"
]
@ -386,6 +383,7 @@
"metadata": {},
"outputs": [],
"source": [
"from tempfile import TemporaryDirectory\n",
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
"\n",
"MILVUS_URI = os.environ.get(\n",