mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
minor notebook updates
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
6e16a2464e
commit
86fd560cfd
@ -74,7 +74,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### Loader and splitter"
|
"### Helpers"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -225,6 +225,13 @@
|
|||||||
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
|
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Loader and splitter"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -420,10 +427,10 @@
|
|||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Question:\n",
|
"Question:\n",
|
||||||
"How many pages were human annotated for DocLayNet?\n",
|
"How many pages were human annotated by humans for DocLayNet?\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Answer:\n",
|
"Answer:\n",
|
||||||
"\"80863 pages were human annotated for DocLayNet.\\nExplanation:\\nFrom the context, it is clear that DocL...\"\n",
|
"\"80863 pages were annotated by humans in DocLayNet.\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Source 1:\n",
|
"Source 1:\n",
|
||||||
" text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n",
|
" text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n",
|
||||||
@ -442,29 +449,36 @@
|
|||||||
" path: $.main-text[23]\n",
|
" path: $.main-text[23]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Source 3:\n",
|
"Source 3:\n",
|
||||||
" text: \"Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11...\"\n",
|
|
||||||
" bbox: [317.3695373535156, 82.78482818603516, 559.7149047851562, 244.83221435546875]\n",
|
|
||||||
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
|
||||||
" heading: 4 ANNOTATION CAMPAIGN\n",
|
|
||||||
" page: 5\n",
|
|
||||||
" path: $.main-text[80]\n",
|
|
||||||
"\n",
|
|
||||||
"Source 4:\n",
|
|
||||||
" text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
|
" text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
|
||||||
" bbox: [53.60108947753906, 723.3781127929688, 347.139892578125, 731.6909790039062]\n",
|
" bbox: [53.60108947753906, 723.3781127929688, 347.139892578125, 731.6909790039062]\n",
|
||||||
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||||
" heading: REFERENCES\n",
|
" heading: REFERENCES\n",
|
||||||
" page: 9\n",
|
" page: 9\n",
|
||||||
" path: $.main-text[133]\n"
|
" path: $.main-text[133]\n",
|
||||||
|
"\n",
|
||||||
|
"Source 4:\n",
|
||||||
|
" text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
|
||||||
|
" bbox: [53.542964935302734, 723.3500366210938, 347.0172424316406, 731.6931762695312]\n",
|
||||||
|
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||||
|
" heading: 4 ANNOTATION CAMPAIGN\n",
|
||||||
|
" page: 5\n",
|
||||||
|
" path: $.main-text[64]\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"resp_dict = rag_chain.invoke(\n",
|
"resp_dict = rag_chain.invoke(\n",
|
||||||
" {\"input\": \"How many pages were human annotated for DocLayNet?\"}\n",
|
" {\"input\": \"How many pages were human annotated by humans for DocLayNet?\"}\n",
|
||||||
")\n",
|
")\n",
|
||||||
"print_qa(resp_dict=resp_dict)"
|
"print_qa(resp_dict=resp_dict)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -43,11 +43,8 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"from tempfile import TemporaryDirectory\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"from dotenv import load_dotenv\n",
|
"from dotenv import load_dotenv\n",
|
||||||
"from pydantic import TypeAdapter\n",
|
|
||||||
"from rich.pretty import pprint\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"load_dotenv()"
|
"load_dotenv()"
|
||||||
]
|
]
|
||||||
@ -386,6 +383,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"from tempfile import TemporaryDirectory\n",
|
||||||
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
|
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
|
||||||
"\n",
|
"\n",
|
||||||
"MILVUS_URI = os.environ.get(\n",
|
"MILVUS_URI = os.environ.get(\n",
|
||||||
|
Loading…
Reference in New Issue
Block a user