mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
minor notebook updates
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
6e16a2464e
commit
86fd560cfd
@ -74,7 +74,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Loader and splitter"
|
||||
"### Helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -225,6 +225,13 @@
|
||||
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Loader and splitter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -420,10 +427,10 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Question:\n",
|
||||
"How many pages were human annotated for DocLayNet?\n",
|
||||
"How many pages were human annotated by humans for DocLayNet?\n",
|
||||
"\n",
|
||||
"Answer:\n",
|
||||
"\"80863 pages were human annotated for DocLayNet.\\nExplanation:\\nFrom the context, it is clear that DocL...\"\n",
|
||||
"\"80863 pages were annotated by humans in DocLayNet.\"\n",
|
||||
"\n",
|
||||
"Source 1:\n",
|
||||
" text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n",
|
||||
@ -442,29 +449,36 @@
|
||||
" path: $.main-text[23]\n",
|
||||
"\n",
|
||||
"Source 3:\n",
|
||||
" text: \"Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11...\"\n",
|
||||
" bbox: [317.3695373535156, 82.78482818603516, 559.7149047851562, 244.83221435546875]\n",
|
||||
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||
" heading: 4 ANNOTATION CAMPAIGN\n",
|
||||
" page: 5\n",
|
||||
" path: $.main-text[80]\n",
|
||||
"\n",
|
||||
"Source 4:\n",
|
||||
" text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
|
||||
" bbox: [53.60108947753906, 723.3781127929688, 347.139892578125, 731.6909790039062]\n",
|
||||
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||
" heading: REFERENCES\n",
|
||||
" page: 9\n",
|
||||
" path: $.main-text[133]\n"
|
||||
" path: $.main-text[133]\n",
|
||||
"\n",
|
||||
"Source 4:\n",
|
||||
" text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
|
||||
" bbox: [53.542964935302734, 723.3500366210938, 347.0172424316406, 731.6931762695312]\n",
|
||||
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||
" heading: 4 ANNOTATION CAMPAIGN\n",
|
||||
" page: 5\n",
|
||||
" path: $.main-text[64]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"resp_dict = rag_chain.invoke(\n",
|
||||
" {\"input\": \"How many pages were human annotated for DocLayNet?\"}\n",
|
||||
" {\"input\": \"How many pages were human annotated by humans for DocLayNet?\"}\n",
|
||||
")\n",
|
||||
"print_qa(resp_dict=resp_dict)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -43,11 +43,8 @@
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from tempfile import TemporaryDirectory\n",
|
||||
"\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from pydantic import TypeAdapter\n",
|
||||
"from rich.pretty import pprint\n",
|
||||
"\n",
|
||||
"load_dotenv()"
|
||||
]
|
||||
@ -386,6 +383,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tempfile import TemporaryDirectory\n",
|
||||
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
|
||||
"\n",
|
||||
"MILVUS_URI = os.environ.get(\n",
|
||||
|
Loading…
Reference in New Issue
Block a user