mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
add docling splitter to LC example, simplify & align QA output
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
f4ee76eaec
commit
6e16a2464e
@ -22,7 +22,7 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# requirements for this example:\n",
|
"# requirements for this example:\n",
|
||||||
"%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus"
|
"%pip install -qq docling docling-core python-dotenv langchain langchain-text-splitters langchain-huggingface langchain-milvus"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -58,7 +58,9 @@
|
|||||||
"import warnings\n",
|
"import warnings\n",
|
||||||
"\n",
|
"\n",
|
||||||
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
|
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
|
||||||
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
|
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n",
|
||||||
|
"# https://github.com/huggingface/transformers/issues/5486:\n",
|
||||||
|
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -80,8 +82,9 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Below we set up:\n",
|
"Below we set up:\n",
|
||||||
"- a `Loader` which will be used to create LangChain documents, and\n",
|
"- a `Loader` which will be used to create LangChain documents,\n",
|
||||||
"- a splitter, which will be used to split these documents"
|
"- a splitter, which will be used to split these documents, and\n",
|
||||||
|
"- a helper function for QA printing"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -95,20 +98,19 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"from langchain_core.document_loaders import BaseLoader\n",
|
"from langchain_core.document_loaders import BaseLoader\n",
|
||||||
"from langchain_core.documents import Document as LCDocument\n",
|
"from langchain_core.documents import Document as LCDocument\n",
|
||||||
"from pydantic import BaseModel\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"from docling.document_converter import DocumentConverter\n",
|
"from docling.document_converter import DocumentConverter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"_KEY_DL_DOC_HASH = \"dl_doc_hash\"\n",
|
||||||
"class DocumentMetadata(BaseModel):\n",
|
"_KEY_ORIGIN = \"origin\"\n",
|
||||||
" dl_doc_hash: str\n",
|
|
||||||
" # source: str\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class DoclingPDFLoader(BaseLoader):\n",
|
"class DoclingPDFLoader(BaseLoader):\n",
|
||||||
" class ParseType(str, Enum):\n",
|
" class ParseType(str, Enum):\n",
|
||||||
" MARKDOWN = \"markdown\"\n",
|
" MARKDOWN = \"markdown\"\n",
|
||||||
" # JSON = \"json\"\n",
|
" JSON = \"json\"\n",
|
||||||
|
"\n",
|
||||||
|
" include_origin: bool = False\n",
|
||||||
"\n",
|
"\n",
|
||||||
" def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n",
|
" def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n",
|
||||||
" self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
|
" self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
|
||||||
@ -121,17 +123,21 @@
|
|||||||
" match self._parse_type:\n",
|
" match self._parse_type:\n",
|
||||||
" case self.ParseType.MARKDOWN:\n",
|
" case self.ParseType.MARKDOWN:\n",
|
||||||
" text = dl_doc.export_to_markdown()\n",
|
" text = dl_doc.export_to_markdown()\n",
|
||||||
" # case self.ParseType.JSON:\n",
|
" case self.ParseType.JSON:\n",
|
||||||
" # text = dl_doc.model_dump_json()\n",
|
" text = dl_doc.model_dump_json()\n",
|
||||||
" case _:\n",
|
" case _:\n",
|
||||||
" raise RuntimeError(\n",
|
" raise RuntimeError(\n",
|
||||||
" f\"Unexpected parse type encountered: {self._parse_type}\"\n",
|
" f\"Unexpected parse type encountered: {self._parse_type}\"\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
|
" metadata = {\n",
|
||||||
|
" _KEY_DL_DOC_HASH: dl_doc.file_info.document_hash,\n",
|
||||||
|
" }\n",
|
||||||
|
" if self.include_origin:\n",
|
||||||
|
" metadata[_KEY_ORIGIN] = source\n",
|
||||||
|
"\n",
|
||||||
" lc_doc = LCDocument(\n",
|
" lc_doc = LCDocument(\n",
|
||||||
" page_content=text,\n",
|
" page_content=text,\n",
|
||||||
" metadata=DocumentMetadata(\n",
|
" metadata=metadata,\n",
|
||||||
" dl_doc_hash=dl_doc.file_info.document_hash,\n",
|
|
||||||
" ).model_dump(),\n",
|
|
||||||
" )\n",
|
" )\n",
|
||||||
" yield lc_doc"
|
" yield lc_doc"
|
||||||
]
|
]
|
||||||
@ -142,40 +148,140 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
|
"import json\n",
|
||||||
|
"from typing import Iterable, List\n",
|
||||||
|
"\n",
|
||||||
|
"from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker\n",
|
||||||
|
"from docling_core.types import Document as DLDocument\n",
|
||||||
|
"from langchain_core.documents import Document as LCDocument\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"class DoclingSplitter:\n",
|
||||||
|
"\n",
|
||||||
|
" def __init__(\n",
|
||||||
|
" self,\n",
|
||||||
|
" chunker: BaseChunker | None = None,\n",
|
||||||
|
" ) -> None:\n",
|
||||||
|
" self.chunker: BaseChunker = chunker or HierarchicalChunker(\n",
|
||||||
|
" heading_as_metadata=True\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" def split_documents(self, documents: Iterable[LCDocument]) -> List[LCDocument]:\n",
|
||||||
|
"\n",
|
||||||
|
" all_chunk_docs: list[LCDocument] = []\n",
|
||||||
|
" for doc in documents:\n",
|
||||||
|
" lc_doc: LCDocument = LCDocument.parse_obj(doc)\n",
|
||||||
|
" dl_doc: DLDocument = DLDocument.model_validate_json(lc_doc.page_content)\n",
|
||||||
|
" chunk_iter = self.chunker.chunk(dl_doc=dl_doc)\n",
|
||||||
|
" for chunk in chunk_iter:\n",
|
||||||
|
" chunk_metadata = chunk.model_dump(\n",
|
||||||
|
" exclude=\"text\",\n",
|
||||||
|
" exclude_none=True,\n",
|
||||||
|
" )\n",
|
||||||
|
" metadata = {**lc_doc.metadata, **chunk_metadata}\n",
|
||||||
|
" for k, v in metadata.items():\n",
|
||||||
|
" if isinstance(v, Iterable) and not isinstance(v, str):\n",
|
||||||
|
" metadata[k] = json.dumps(v)\n",
|
||||||
|
" chunk_doc = LCDocument(\n",
|
||||||
|
" page_content=chunk.text,\n",
|
||||||
|
" metadata=metadata,\n",
|
||||||
|
" )\n",
|
||||||
|
" all_chunk_docs.append(chunk_doc)\n",
|
||||||
|
"\n",
|
||||||
|
" return all_chunk_docs"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "1b38d07d5fed4618a44ecf261e1e5c44",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
"def print_qa(resp_dict):\n",
|
||||||
|
" def clip(inp, max_len=100):\n",
|
||||||
|
" if isinstance(inp, str):\n",
|
||||||
|
" return f\"{inp[:max_len]}{'...' if len(inp) > max_len else ''}\"\n",
|
||||||
|
" else:\n",
|
||||||
|
" return inp\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
" print(\n",
|
||||||
|
" f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{json.dumps(clip(resp_dict['answer']))}\"\n",
|
||||||
|
" )\n",
|
||||||
|
" for i, doc in enumerate(resp_dict[\"context\"]):\n",
|
||||||
|
" print()\n",
|
||||||
|
" print(f\"Source {i+1}:\")\n",
|
||||||
|
" print(f\" text: {json.dumps(clip(doc.page_content))}\")\n",
|
||||||
|
" for key in doc.metadata:\n",
|
||||||
|
" if key != \"pk\":\n",
|
||||||
|
" print(f\" {key}: {clip(doc.metadata.get(key))}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Using native Docling format (as JSON)**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"To leverage Docling's rich document structure format, we can export to JSON and use the `DoclingSplitter` accordingly:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
"loader = DoclingPDFLoader(\n",
|
"loader = DoclingPDFLoader(\n",
|
||||||
" file_path=FILE_PATH,\n",
|
" file_path=FILE_PATH,\n",
|
||||||
" parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
|
" parse_type=DoclingPDFLoader.ParseType.JSON,\n",
|
||||||
")\n",
|
")\n",
|
||||||
"text_splitter = RecursiveCharacterTextSplitter(\n",
|
"splitter = DoclingSplitter()"
|
||||||
" chunk_size=1000,\n",
|
]
|
||||||
" chunk_overlap=200,\n",
|
},
|
||||||
")"
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Using Markdown**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
||||||
|
"\n",
|
||||||
|
"# loader = DoclingPDFLoader(\n",
|
||||||
|
"# file_path=FILE_PATH,\n",
|
||||||
|
"# parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
|
||||||
|
"# )\n",
|
||||||
|
"# splitter = RecursiveCharacterTextSplitter(\n",
|
||||||
|
"# chunk_size=1000,\n",
|
||||||
|
"# chunk_overlap=200,\n",
|
||||||
|
"# )"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -187,31 +293,31 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"docs = loader.load()\n",
|
"docs = loader.load()\n",
|
||||||
"splits = text_splitter.split_documents(docs)"
|
"splits = splitter.split_documents(docs)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### Embeddings"
|
"### Embed model"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
|
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
|
||||||
"\n",
|
"\n",
|
||||||
"HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
|
"HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
|
||||||
"embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
|
"embedding = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -223,7 +329,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 12,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -232,13 +338,14 @@
|
|||||||
"from langchain_milvus import Milvus\n",
|
"from langchain_milvus import Milvus\n",
|
||||||
"\n",
|
"\n",
|
||||||
"MILVUS_URI = os.environ.get(\n",
|
"MILVUS_URI = os.environ.get(\n",
|
||||||
" \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
|
" \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"vectorstore = Milvus.from_documents(\n",
|
"vectorstore = Milvus.from_documents(\n",
|
||||||
" splits,\n",
|
" splits,\n",
|
||||||
" embeddings,\n",
|
" embedding,\n",
|
||||||
" connection_args={\"uri\": MILVUS_URI},\n",
|
" connection_args={\"uri\": MILVUS_URI},\n",
|
||||||
|
" collection_name=\"docling_lc_demo\",\n",
|
||||||
" drop_old=True,\n",
|
" drop_old=True,\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@ -252,7 +359,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 13,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -287,62 +394,77 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from typing import Iterable\n",
|
"from langchain.chains import create_retrieval_chain\n",
|
||||||
"\n",
|
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
|
||||||
"from langchain_core.documents import Document as LCDocument\n",
|
|
||||||
"from langchain_core.output_parsers import StrOutputParser\n",
|
|
||||||
"from langchain_core.prompts import PromptTemplate\n",
|
"from langchain_core.prompts import PromptTemplate\n",
|
||||||
"from langchain_core.runnables import RunnablePassthrough\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def format_docs(docs: Iterable[LCDocument]):\n",
|
|
||||||
" return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"retriever = vectorstore.as_retriever()\n",
|
"retriever = vectorstore.as_retriever()\n",
|
||||||
"\n",
|
|
||||||
"prompt = PromptTemplate.from_template(\n",
|
"prompt = PromptTemplate.from_template(\n",
|
||||||
" \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",
|
" \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {input}\\nAnswer:\\n\"\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"question_answer_chain = create_stuff_documents_chain(llm, prompt)\n",
|
||||||
"rag_chain = (\n",
|
"rag_chain = create_retrieval_chain(retriever, question_answer_chain)"
|
||||||
" {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
|
|
||||||
" | prompt\n",
|
|
||||||
" | llm\n",
|
|
||||||
" | StrOutputParser()\n",
|
|
||||||
")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 15,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"name": "stdout",
|
||||||
"text/plain": [
|
"output_type": "stream",
|
||||||
"'The human annotation of DocLayNet was performed on 80863 pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'"
|
"text": [
|
||||||
]
|
"Question:\n",
|
||||||
},
|
"How many pages were human annotated for DocLayNet?\n",
|
||||||
"execution_count": 12,
|
"\n",
|
||||||
"metadata": {},
|
"Answer:\n",
|
||||||
"output_type": "execute_result"
|
"\"80863 pages were human annotated for DocLayNet.\\nExplanation:\\nFrom the context, it is clear that DocL...\"\n",
|
||||||
|
"\n",
|
||||||
|
"Source 1:\n",
|
||||||
|
" text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n",
|
||||||
|
" bbox: [317.2852478027344, 116.46983337402344, 559.7131958007812, 201.73675537109375]\n",
|
||||||
|
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||||
|
" heading: 3 THE DOCLAYNET DATASET\n",
|
||||||
|
" page: 2\n",
|
||||||
|
" path: $.main-text[37]\n",
|
||||||
|
"\n",
|
||||||
|
"Source 2:\n",
|
||||||
|
" text: \"In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-tr...\"\n",
|
||||||
|
" bbox: [53.50020980834961, 212.36782836914062, 295.56396484375, 286.4964599609375]\n",
|
||||||
|
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||||
|
" heading: 1 INTRODUCTION\n",
|
||||||
|
" page: 2\n",
|
||||||
|
" path: $.main-text[23]\n",
|
||||||
|
"\n",
|
||||||
|
"Source 3:\n",
|
||||||
|
" text: \"Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11...\"\n",
|
||||||
|
" bbox: [317.3695373535156, 82.78482818603516, 559.7149047851562, 244.83221435546875]\n",
|
||||||
|
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||||
|
" heading: 4 ANNOTATION CAMPAIGN\n",
|
||||||
|
" page: 5\n",
|
||||||
|
" path: $.main-text[80]\n",
|
||||||
|
"\n",
|
||||||
|
"Source 4:\n",
|
||||||
|
" text: \"DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\"\n",
|
||||||
|
" bbox: [53.60108947753906, 723.3781127929688, 347.139892578125, 731.6909790039062]\n",
|
||||||
|
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||||
|
" heading: REFERENCES\n",
|
||||||
|
" page: 9\n",
|
||||||
|
" path: $.main-text[133]\n"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
|
"resp_dict = rag_chain.invoke(\n",
|
||||||
|
" {\"input\": \"How many pages were human annotated for DocLayNet?\"}\n",
|
||||||
|
")\n",
|
||||||
|
"print_qa(resp_dict=resp_dict)"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -61,7 +61,9 @@
|
|||||||
"import warnings\n",
|
"import warnings\n",
|
||||||
"\n",
|
"\n",
|
||||||
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
|
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
|
||||||
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
|
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n",
|
||||||
|
"# https://github.com/huggingface/transformers/issues/5486:\n",
|
||||||
|
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -84,8 +86,9 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"Below we define:\n",
|
"Below we define:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"- `DoclingPDFReader` which will be used to create LlamaIndex documents, and\n",
|
"- `DoclingPDFReader` which will be used to create LlamaIndex documents,\n",
|
||||||
"- `HierarchicalJSONNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents\n"
|
"- `DoclingNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents, and\n",
|
||||||
|
"- a helper function for QA printing"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -103,9 +106,9 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"from docling.document_converter import DocumentConverter\n",
|
"from docling.document_converter import DocumentConverter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class DocMetaKeys(str, Enum):\n",
|
"_KEY_DL_DOC_HASH = \"dl_doc_hash\"\n",
|
||||||
" DL_DOC_HASH = \"dl_doc_hash\"\n",
|
"_KEY_ORIGIN = \"origin\"\n",
|
||||||
" ORIGIN = \"origin\"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class DoclingPDFReader(BasePydanticReader):\n",
|
"class DoclingPDFReader(BasePydanticReader):\n",
|
||||||
" class ParseType(str, Enum):\n",
|
" class ParseType(str, Enum):\n",
|
||||||
@ -113,6 +116,7 @@
|
|||||||
" JSON = \"json\"\n",
|
" JSON = \"json\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
" parse_type: ParseType = ParseType.MARKDOWN\n",
|
" parse_type: ParseType = ParseType.MARKDOWN\n",
|
||||||
|
" include_origin: bool = False\n",
|
||||||
"\n",
|
"\n",
|
||||||
" def lazy_load_data(\n",
|
" def lazy_load_data(\n",
|
||||||
" self,\n",
|
" self,\n",
|
||||||
@ -140,9 +144,10 @@
|
|||||||
" origin = str(source) if isinstance(source, Path) else source\n",
|
" origin = str(source) if isinstance(source, Path) else source\n",
|
||||||
" li_doc = LIDocument(text=text)\n",
|
" li_doc = LIDocument(text=text)\n",
|
||||||
" li_doc.metadata = {\n",
|
" li_doc.metadata = {\n",
|
||||||
" DocMetaKeys.DL_DOC_HASH: dl_doc.file_info.document_hash,\n",
|
" _KEY_DL_DOC_HASH: dl_doc.file_info.document_hash,\n",
|
||||||
" DocMetaKeys.ORIGIN: origin,\n",
|
|
||||||
" }\n",
|
" }\n",
|
||||||
|
" if self.include_origin:\n",
|
||||||
|
" li_doc.metadata[_KEY_ORIGIN] = origin\n",
|
||||||
" yield li_doc"
|
" yield li_doc"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -154,10 +159,11 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from typing import Any, Iterable, Sequence\n",
|
"from typing import Any, Iterable, Sequence\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from docling_core.transforms.chunker import ChunkWithMetadata, HierarchicalChunker\n",
|
"from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker\n",
|
||||||
"from docling_core.types import Document as DLDocument\n",
|
"from docling_core.types import Document as DLDocument\n",
|
||||||
"from llama_index.core import Document as LIDocument\n",
|
"from llama_index.core import Document as LIDocument\n",
|
||||||
"from llama_index.core.node_parser.interface import NodeParser\n",
|
"from llama_index.core.node_parser.interface import NodeParser\n",
|
||||||
|
"from llama_index.core.node_parser.node_utils import IdFuncCallable, default_id_func\n",
|
||||||
"from llama_index.core.schema import (\n",
|
"from llama_index.core.schema import (\n",
|
||||||
" BaseNode,\n",
|
" BaseNode,\n",
|
||||||
" NodeRelationship,\n",
|
" NodeRelationship,\n",
|
||||||
@ -167,14 +173,8 @@
|
|||||||
"from llama_index.core.utils import get_tqdm_iterable\n",
|
"from llama_index.core.utils import get_tqdm_iterable\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class NodeMetaKeys(str, Enum):\n",
|
"class DoclingNodeParser(NodeParser):\n",
|
||||||
" PATH = \"path\"\n",
|
" chunker: BaseChunker = HierarchicalChunker(heading_as_metadata=True)\n",
|
||||||
" PAGE = \"page\"\n",
|
|
||||||
" BBOX = \"bbox\"\n",
|
|
||||||
" ORIGIN = \"origin\"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"class HierarchicalJSONNodeParser(NodeParser):\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" def _parse_nodes(\n",
|
" def _parse_nodes(\n",
|
||||||
" self,\n",
|
" self,\n",
|
||||||
@ -182,36 +182,68 @@
|
|||||||
" show_progress: bool = False,\n",
|
" show_progress: bool = False,\n",
|
||||||
" **kwargs: Any,\n",
|
" **kwargs: Any,\n",
|
||||||
" ) -> list[BaseNode]:\n",
|
" ) -> list[BaseNode]:\n",
|
||||||
|
" id_func: IdFuncCallable = self.id_func or default_id_func\n",
|
||||||
" nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(\n",
|
" nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(\n",
|
||||||
" items=nodes, show_progress=show_progress, desc=\"Parsing nodes\"\n",
|
" items=nodes, show_progress=show_progress, desc=\"Parsing nodes\"\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" all_nodes: list[BaseNode] = []\n",
|
" all_nodes: list[BaseNode] = []\n",
|
||||||
" chunker = HierarchicalChunker()\n",
|
|
||||||
" for input_node in nodes_with_progress:\n",
|
" for input_node in nodes_with_progress:\n",
|
||||||
" li_doc = LIDocument.model_validate(input_node)\n",
|
" li_doc = LIDocument.model_validate(input_node)\n",
|
||||||
" dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())\n",
|
" dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())\n",
|
||||||
" chunk_iter = chunker.chunk(dl_doc=dl_doc)\n",
|
" chunk_iter = self.chunker.chunk(dl_doc=dl_doc)\n",
|
||||||
" for chunk in chunk_iter:\n",
|
" for i, chunk in enumerate(chunk_iter):\n",
|
||||||
" rels: dict[NodeRelationship, RelatedNodeType] = {\n",
|
" rels: dict[NodeRelationship, RelatedNodeType] = {\n",
|
||||||
" NodeRelationship.SOURCE: li_doc.as_related_node_info(),\n",
|
" NodeRelationship.SOURCE: li_doc.as_related_node_info(),\n",
|
||||||
" }\n",
|
" }\n",
|
||||||
" excl_doc_meta_keys = [d.value for d in DocMetaKeys]\n",
|
" metadata = chunk.model_dump(\n",
|
||||||
" excl_node_meta_keys = [n.value for n in NodeMetaKeys]\n",
|
" exclude=\"text\",\n",
|
||||||
" excl_meta_keys = excl_doc_meta_keys + excl_node_meta_keys\n",
|
" exclude_none=True,\n",
|
||||||
|
" )\n",
|
||||||
|
" # by default we exclude all meta keys from embedding/LLM — unless allowed\n",
|
||||||
|
" excl_meta_keys = [k for k in metadata if k not in {\"heading\"}]\n",
|
||||||
|
" if self.include_metadata:\n",
|
||||||
|
" excl_meta_keys = [k for k in li_doc.metadata] + excl_meta_keys\n",
|
||||||
" node = TextNode(\n",
|
" node = TextNode(\n",
|
||||||
|
" id_=id_func(i=i, doc=li_doc),\n",
|
||||||
" text=chunk.text,\n",
|
" text=chunk.text,\n",
|
||||||
" excluded_embed_metadata_keys=excl_meta_keys,\n",
|
" excluded_embed_metadata_keys=excl_meta_keys,\n",
|
||||||
" excluded_llm_metadata_keys=excl_meta_keys,\n",
|
" excluded_llm_metadata_keys=excl_meta_keys,\n",
|
||||||
" relationships=rels,\n",
|
" relationships=rels,\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" node.metadata = {NodeMetaKeys.PATH: chunk.path}\n",
|
" node.metadata = metadata\n",
|
||||||
" if isinstance(chunk, ChunkWithMetadata):\n",
|
|
||||||
" node.metadata[NodeMetaKeys.PAGE] = chunk.page\n",
|
|
||||||
" node.metadata[NodeMetaKeys.BBOX] = chunk.bbox\n",
|
|
||||||
" all_nodes.append(node)\n",
|
" all_nodes.append(node)\n",
|
||||||
" return all_nodes"
|
" return all_nodes"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"\n",
|
||||||
|
"from llama_index.core.base.response.schema import RESPONSE_TYPE\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def print_qa(query: str, query_res: RESPONSE_TYPE):\n",
|
||||||
|
" def clip(inp, max_len=100):\n",
|
||||||
|
" if isinstance(inp, str):\n",
|
||||||
|
" return f\"{inp[:max_len]}{'...' if len(inp) > max_len else ''}\"\n",
|
||||||
|
" else:\n",
|
||||||
|
" return inp\n",
|
||||||
|
"\n",
|
||||||
|
" print(\n",
|
||||||
|
" f\"Question:\\n{query}\\n\\nAnswer:\\n{json.dumps(clip(query_res.response.strip()))}\"\n",
|
||||||
|
" )\n",
|
||||||
|
" for i, res in enumerate(query_res.source_nodes):\n",
|
||||||
|
" print()\n",
|
||||||
|
" print(f\"Source {i+1}:\")\n",
|
||||||
|
" print(f\" text: {json.dumps(clip(res.text.strip()))}\")\n",
|
||||||
|
" for key in res.metadata:\n",
|
||||||
|
" print(f\" {key}: {clip(res.metadata.get(key))}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -223,31 +255,31 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"#### Using JSON"
|
"**Using native Docling format (as JSON)**"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"To leverage Docling's rich document structure format, we can namely export to JSON and use the HierarchicalJSONNodeParser accordingly:"
|
"To leverage Docling's rich document structure format, we can export to JSON and use the `DoclingNodeParser` accordingly:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)\n",
|
"reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)\n",
|
||||||
"node_parser = HierarchicalJSONNodeParser()"
|
"node_parser = DoclingNodeParser()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"#### Using Markdown"
|
"**Using Markdown**"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -259,7 +291,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -285,7 +317,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 9,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -301,7 +333,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -323,7 +355,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -341,7 +373,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 12,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -350,89 +382,29 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 13,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
|
||||||
"To disable this warning, you can either:\n",
|
|
||||||
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
|
||||||
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
|
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
|
||||||
"\n",
|
"\n",
|
||||||
"MILVUS_URL = os.environ.get(\n",
|
"MILVUS_URI = os.environ.get(\n",
|
||||||
" \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
|
" \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
|
||||||
")\n",
|
")\n",
|
||||||
"MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"basic_llamaindex_pipeline\")\n",
|
|
||||||
"MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get(\"MILVUS_KWARGS\", \"{}\"))\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"vector_store = MilvusVectorStore(\n",
|
"vector_store = MilvusVectorStore(\n",
|
||||||
" uri=MILVUS_URL,\n",
|
" uri=MILVUS_URI,\n",
|
||||||
" collection_name=MILVUS_COLL_NAME,\n",
|
" collection_name=\"docling_li_demo\",\n",
|
||||||
" dim=len(embed_model.get_text_embedding(\"hi\")),\n",
|
" dim=len(embed_model.get_text_embedding(\"hi\")),\n",
|
||||||
" overwrite=INGEST,\n",
|
" overwrite=INGEST,\n",
|
||||||
" **MILVUS_KWARGS,\n",
|
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">[</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Document</span><span style=\"font-weight: bold\">(</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">}</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[]</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[]</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{}</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">173793</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">)</span>\n",
|
|
||||||
"<span style=\"font-weight: bold\">]</span>\n",
|
|
||||||
"</pre>\n"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
"\u001b[1m[\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"'\u001b[0m+\u001b[1;36m173793\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ \u001b[0m\u001b[1m)\u001b[0m\n",
|
|
||||||
"\u001b[1m]\u001b[0m\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from llama_index.core import StorageContext, VectorStoreIndex\n",
|
"from llama_index.core import StorageContext, VectorStoreIndex\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -441,7 +413,6 @@
|
|||||||
" docs = reader.load_data(\n",
|
" docs = reader.load_data(\n",
|
||||||
" file_path=\"https://arxiv.org/pdf/2206.01062\", # DocLayNet paper\n",
|
" file_path=\"https://arxiv.org/pdf/2206.01062\", # DocLayNet paper\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" pprint(docs, max_length=1, max_string=50, max_depth=4)\n",
|
|
||||||
" storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
|
" storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
|
||||||
" index = VectorStoreIndex.from_documents(\n",
|
" index = VectorStoreIndex.from_documents(\n",
|
||||||
" documents=docs,\n",
|
" documents=docs,\n",
|
||||||
@ -466,7 +437,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 15,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -489,239 +460,42 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 16,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"name": "stdout",
|
||||||
"text/html": [
|
"output_type": "stream",
|
||||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Response</span><span style=\"font-weight: bold\">(</span>\n",
|
"text": [
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">response</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'80863 pages were annotated by humans.'</span>,\n",
|
"Question:\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">source_nodes</span>=<span style=\"font-weight: bold\">[</span>\n",
|
"How many pages were annotated by humans?\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"font-weight: bold\">(</span>\n",
|
"\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"font-weight: bold\">(</span>\n",
|
"Answer:\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'79ee790b-73d8-4268-90d7-301b5cd5e8f4'</span>,\n",
|
"\"80863 pages were annotated by humans.\"\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
"\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span>\n",
|
"Source 1:\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
|
" text: \"DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and ...\"\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
|
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[36]'</span>,\n",
|
" path: $.main-text[37]\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>,\n",
|
" page: 2\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">317.11236572265625</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">116.19312286376953</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">559.7131958007812</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">202.27523803710938</span><span style=\"font-weight: bold\">]</span>\n",
|
" bbox: [317.2852478027344, 116.46983337402344, 559.7131958007812, 201.73675537109375]\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
|
" heading: 3 THE DOCLAYNET DATASET\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">]</span>,\n",
|
"\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">]</span>,\n",
|
"Source 2:\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{</span>\n",
|
" text: \"In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-tr...\"\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">NodeRelationship.SOURCE:</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
" dl_doc_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
" path: $.main-text[23]\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.DOCUMENT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'4'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
" page: 2\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
" bbox: [53.50020980834961, 212.36782836914062, 295.56396484375, 286.4964599609375]\n",
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'</span>\n",
|
" heading: 1 INTRODUCTION\n"
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
]
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.PREVIOUS: </span><span style=\"color: #008000; text-decoration-color: #008000\">'2'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'5509c0ef-2890-4bba-aa0f-82c0c389a621'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'d2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.NEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'3'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'47f51f1f-e92f-4d82-b36e-466fa62f8e34'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'</span><span style=\"color: #000000; text-decoration-color: #000000\">+</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">296</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.8344892859458923</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'e1585b75-17f1-42b1-882a-f44e6ae4d382'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[75]'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">53.26631546020508</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">86.24749755859375</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">295.562255859375</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">215.95584106445312</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000\"> +</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000\"> +</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.SOURCE: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.DOCUMENT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'4'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.PREVIOUS: </span><span style=\"color: #008000; text-decoration-color: #008000\">'2'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'964511c8-a412-47c4-8a3d-e4bf92edbda4'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.NEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'3'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"font-weight: bold\">></span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\">)</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">564</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"font-weight: bold\">)</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.8309065699577332</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">]</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'79ee790b-73d8-4268-90d7-301b5cd5e8f4'</span>: <span style=\"font-weight: bold\">{</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[36]'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">317.11236572265625</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">116.19312286376953</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">559.7131958007812</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">202.27523803710938</span><span style=\"font-weight: bold\">]</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">}</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'e1585b75-17f1-42b1-882a-f44e6ae4d382'</span>: <span style=\"font-weight: bold\">{</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[75]'</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>,\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">53.26631546020508</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">86.24749755859375</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">295.562255859375</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">215.95584106445312</span><span style=\"font-weight: bold\">]</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">}</span>\n",
|
|
||||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">}</span>\n",
|
|
||||||
"<span style=\"font-weight: bold\">)</span>\n",
|
|
||||||
"</pre>\n"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
"\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were annotated by humans.'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.DOCUMENT: \u001b[0m\u001b[32m'4'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.PREVIOUS: \u001b[0m\u001b[32m'2'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'5509c0ef-2890-4bba-aa0f-82c0c389a621'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'd2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.NEXT: \u001b[0m\u001b[32m'3'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'47f51f1f-e92f-4d82-b36e-466fa62f8e34'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'\u001b[0m\u001b[39m+\u001b[0m\u001b[1;36m296\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'text/plain'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\\n'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m\u001b[39m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;36m.8344892859458923\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m\u001b[39m=\u001b[0m\u001b[1;35mTextNode\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m5\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m86.24749755859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m295.562255859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m215.95584106445312\u001b[0m\u001b[1;39m]\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.SOURCE: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.DOCUMENT: \u001b[0m\u001b[32m'4'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.PREVIOUS: \u001b[0m\u001b[32m'2'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'964511c8-a412-47c4-8a3d-e4bf92edbda4'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.NEXT: \u001b[0m\u001b[32m'3'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd'\u001b[0m\u001b[39m,\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[1m>\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m=\u001b[32m'09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore'\u001b[0m+\u001b[1;36m564\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.8309065699577332\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m: \u001b[1m{\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m: \u001b[1m{\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m5\u001b[0m,\n",
|
|
||||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m, \u001b[1;36m86.24749755859375\u001b[0m, \u001b[1;36m295.562255859375\u001b[0m, \u001b[1;36m215.95584106445312\u001b[0m\u001b[1m]\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m\n",
|
|
||||||
"\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n",
|
|
||||||
"\u001b[1m)\u001b[0m\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"query_engine = index.as_query_engine(llm=llm)\n",
|
"query_engine = index.as_query_engine(llm=llm)\n",
|
||||||
"query_res = query_engine.query(\"How many pages were annotated by humans?\")\n",
|
"QUERY = \"How many pages were annotated by humans?\"\n",
|
||||||
"pprint(query_res, max_length=5, max_string=250, max_depth=6)"
|
"query_res = query_engine.query(QUERY)\n",
|
||||||
|
"print_qa(query=QUERY, query_res=query_res)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
82
poetry.lock
generated
82
poetry.lock
generated
@ -947,13 +947,13 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-core"
|
name = "docling-core"
|
||||||
version = "1.6.2"
|
version = "1.7.0"
|
||||||
description = "A python library to define and validate data types in Docling."
|
description = "A python library to define and validate data types in Docling."
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.9"
|
python-versions = "<4.0,>=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "docling_core-1.6.2-py3-none-any.whl", hash = "sha256:1473ab13910d76552015c10fe351b90079a00c225f76ada3cd4fc7442183ffd0"},
|
{file = "docling_core-1.7.0-py3-none-any.whl", hash = "sha256:f4b94d1c21bbf4f7e945609d0ac2253ba29cfd120be08785440ee8d19118dd8b"},
|
||||||
{file = "docling_core-1.6.2.tar.gz", hash = "sha256:63f2b8a683dec56568ee1cd7d25cea419c0291211a88a11f74079ff2d62ccd5e"},
|
{file = "docling_core-1.7.0.tar.gz", hash = "sha256:edebd5ebfef1478782dccae50f236f364f8af02d21789790a3bbb8ddb26b42d3"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -2305,22 +2305,49 @@ files = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain-core"
|
name = "langchain"
|
||||||
version = "0.2.40"
|
version = "0.3.2"
|
||||||
description = "Building applications with LLMs through composability"
|
description = "Building applications with LLMs through composability"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.8.1"
|
python-versions = "<4.0,>=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "langchain_core-0.2.40-py3-none-any.whl", hash = "sha256:71fff5cafa4b9c82a3a716e985f071383be452c35d8cc3169b3a393e6857fc99"},
|
{file = "langchain-0.3.2-py3-none-any.whl", hash = "sha256:cf005dcba132e46fb5e8d3dfaf7f8751bffd2d73e738c36be58f41edc7e3a4b8"},
|
||||||
{file = "langchain_core-0.2.40.tar.gz", hash = "sha256:c838ea0c0b73475a8e58ced3e306b6d926ef063721abd164f237c8664916f502"},
|
{file = "langchain-0.3.2.tar.gz", hash = "sha256:dc330e6eb10d81d23ba0305d18358702c73cc59e95c410eca6c6779aab4ddc9b"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
aiohttp = ">=3.8.3,<4.0.0"
|
||||||
|
async-timeout = {version = ">=4.0.0,<5.0.0", markers = "python_version < \"3.11\""}
|
||||||
|
langchain-core = ">=0.3.8,<0.4.0"
|
||||||
|
langchain-text-splitters = ">=0.3.0,<0.4.0"
|
||||||
|
langsmith = ">=0.1.17,<0.2.0"
|
||||||
|
numpy = [
|
||||||
|
{version = ">=1,<2", markers = "python_version < \"3.12\""},
|
||||||
|
{version = ">=1.26.0,<2.0.0", markers = "python_version >= \"3.12\""},
|
||||||
|
]
|
||||||
|
pydantic = ">=2.7.4,<3.0.0"
|
||||||
|
PyYAML = ">=5.3"
|
||||||
|
requests = ">=2,<3"
|
||||||
|
SQLAlchemy = ">=1.4,<3"
|
||||||
|
tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "langchain-core"
|
||||||
|
version = "0.3.8"
|
||||||
|
description = "Building applications with LLMs through composability"
|
||||||
|
optional = false
|
||||||
|
python-versions = "<4.0,>=3.9"
|
||||||
|
files = [
|
||||||
|
{file = "langchain_core-0.3.8-py3-none-any.whl", hash = "sha256:07015f7b1d9f52eefe05130e8cafe4dcbdbbf72a8411c9edafe38422e4d11b5c"},
|
||||||
|
{file = "langchain_core-0.3.8.tar.gz", hash = "sha256:7485904f7082f1df880d5ae470a488161616132f30d99f556a1877901fffd1cb"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
jsonpatch = ">=1.33,<2.0"
|
jsonpatch = ">=1.33,<2.0"
|
||||||
langsmith = ">=0.1.112,<0.2.0"
|
langsmith = ">=0.1.125,<0.2.0"
|
||||||
packaging = ">=23.2,<25"
|
packaging = ">=23.2,<25"
|
||||||
pydantic = [
|
pydantic = [
|
||||||
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
|
{version = ">=2.5.2,<3.0.0", markers = "python_full_version < \"3.12.4\""},
|
||||||
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
||||||
]
|
]
|
||||||
PyYAML = ">=5.3"
|
PyYAML = ">=5.3"
|
||||||
@ -2329,18 +2356,18 @@ typing-extensions = ">=4.7"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain-huggingface"
|
name = "langchain-huggingface"
|
||||||
version = "0.0.3"
|
version = "0.1.0"
|
||||||
description = "An integration package connecting Hugging Face and LangChain"
|
description = "An integration package connecting Hugging Face and LangChain"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.8.1"
|
python-versions = "<4.0,>=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "langchain_huggingface-0.0.3-py3-none-any.whl", hash = "sha256:d6827adf3c7c8fcc0bca8c43c7e900c3bf68af9a1532a83d4b8ace137e02887e"},
|
{file = "langchain_huggingface-0.1.0-py3-none-any.whl", hash = "sha256:1b3dc44f460ba205f7c13cf48379a7f809ac9e056ea741bd12fc6414ac6aefb7"},
|
||||||
{file = "langchain_huggingface-0.0.3.tar.gz", hash = "sha256:0637acf484c47323cf3dcc46745a93467f6955989af9b7c01e2382fe1b630aaf"},
|
{file = "langchain_huggingface-0.1.0.tar.gz", hash = "sha256:fa77a9545de77e7be4a7de134f692da413fc3b2b99d364fc9bcd0c6fa6fcb35c"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
huggingface-hub = ">=0.23.0"
|
huggingface-hub = ">=0.23.0"
|
||||||
langchain-core = ">=0.1.52,<0.3"
|
langchain-core = ">=0.3.0,<0.4"
|
||||||
sentence-transformers = ">=2.6.0"
|
sentence-transformers = ">=2.6.0"
|
||||||
tokenizers = ">=0.19.1"
|
tokenizers = ">=0.19.1"
|
||||||
transformers = ">=4.39.0"
|
transformers = ">=4.39.0"
|
||||||
@ -2366,27 +2393,27 @@ scipy = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain-text-splitters"
|
name = "langchain-text-splitters"
|
||||||
version = "0.2.4"
|
version = "0.3.0"
|
||||||
description = "LangChain text splitting utilities"
|
description = "LangChain text splitting utilities"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.8.1"
|
python-versions = "<4.0,>=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "langchain_text_splitters-0.2.4-py3-none-any.whl", hash = "sha256:2702dee5b7cbdd595ccbe43b8d38d01a34aa8583f4d6a5a68ad2305ae3e7b645"},
|
{file = "langchain_text_splitters-0.3.0-py3-none-any.whl", hash = "sha256:e84243e45eaff16e5b776cd9c81b6d07c55c010ebcb1965deb3d1792b7358e83"},
|
||||||
{file = "langchain_text_splitters-0.2.4.tar.gz", hash = "sha256:f7daa7a3b0aa8309ce248e2e2b6fc8115be01118d336c7f7f7dfacda0e89bf29"},
|
{file = "langchain_text_splitters-0.3.0.tar.gz", hash = "sha256:f9fe0b4d244db1d6de211e7343d4abc4aa90295aa22e1f0c89e51f33c55cd7ce"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
langchain-core = ">=0.2.38,<0.3.0"
|
langchain-core = ">=0.3.0,<0.4.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langsmith"
|
name = "langsmith"
|
||||||
version = "0.1.121"
|
version = "0.1.131"
|
||||||
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
|
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.8.1"
|
python-versions = "<4.0,>=3.8.1"
|
||||||
files = [
|
files = [
|
||||||
{file = "langsmith-0.1.121-py3-none-any.whl", hash = "sha256:fdb1ac8a671d3904201bfeea197d87bded46a10d08f1034af464211872e29893"},
|
{file = "langsmith-0.1.131-py3-none-any.whl", hash = "sha256:80c106b1c42307195cc0bb3a596472c41ef91b79d15bcee9938307800336c563"},
|
||||||
{file = "langsmith-0.1.121.tar.gz", hash = "sha256:e9381b82a5bd484af9a51c3e96faea572746b8d617b070c1cda40cbbe48e33df"},
|
{file = "langsmith-0.1.131.tar.gz", hash = "sha256:626101a3bf3ca481e5110d5155ace8aa066e4e9cc2fa7d96c8290ade0fbff797"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -2397,6 +2424,7 @@ pydantic = [
|
|||||||
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
||||||
]
|
]
|
||||||
requests = ">=2,<3"
|
requests = ">=2,<3"
|
||||||
|
requests-toolbelt = ">=1.0.0,<2.0.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lazy-loader"
|
name = "lazy-loader"
|
||||||
@ -4324,13 +4352,13 @@ testutils = ["gitpython (>3)"]
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pymilvus"
|
name = "pymilvus"
|
||||||
version = "2.4.6"
|
version = "2.4.7"
|
||||||
description = "Python Sdk for Milvus"
|
description = "Python Sdk for Milvus"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
{file = "pymilvus-2.4.6-py3-none-any.whl", hash = "sha256:b4c43472edc313b845d313be50610e19054e6954b2c5c3b515565c596c2d3d97"},
|
{file = "pymilvus-2.4.7-py3-none-any.whl", hash = "sha256:1e5d377bd40fa7eb459d3958dbd96201758f5cf997d41eb3d2d169d0b7fa462e"},
|
||||||
{file = "pymilvus-2.4.6.tar.gz", hash = "sha256:6ac3eb91c92cc01bbe444fe83f895f02d7b2546d96ac67998630bf31ac074d66"},
|
{file = "pymilvus-2.4.7.tar.gz", hash = "sha256:9ef460b940782a42e1b7b8ae0da03d8cc02d9d80044d13f4b689a7c935ec7aa7"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -7124,4 +7152,4 @@ type = ["pytest-mypy"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "7c5fb235944009b74193d045f36c1be2a8e168393012bf952541e6e7dea08072"
|
content-hash = "429aab5a8ab3e6914ff34386c3b20c88ed019066da362567d2b8d1306cc698cc"
|
||||||
|
@ -77,9 +77,10 @@ python-dotenv = "^1.0.1"
|
|||||||
llama-index-embeddings-huggingface = "^0.3.1"
|
llama-index-embeddings-huggingface = "^0.3.1"
|
||||||
llama-index-llms-huggingface-api = "^0.2.0"
|
llama-index-llms-huggingface-api = "^0.2.0"
|
||||||
llama-index-vector-stores-milvus = "^0.2.1"
|
llama-index-vector-stores-milvus = "^0.2.1"
|
||||||
langchain-huggingface = "^0.0.3"
|
langchain = "^0.3.2"
|
||||||
langchain-milvus = "^0.1.4"
|
langchain-huggingface = "^0.1.0"
|
||||||
langchain-text-splitters = "^0.2.4"
|
langchain-milvus = "^0.1.5"
|
||||||
|
langchain-text-splitters = "^0.3.0"
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[tool.poetry.scripts]
|
||||||
docling = "docling.cli.main:app"
|
docling = "docling.cli.main:app"
|
||||||
|
Loading…
Reference in New Issue
Block a user