mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
chore: showcase extended metadata in LlamaIndex example
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
9b82ae3324
commit
f4ee76eaec
@ -75,16 +75,17 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Reader and node parser"
|
||||
"### Helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Below we set up:\n",
|
||||
"- a `Reader` which will be used to create LlamaIndex documents, and\n",
|
||||
"- a `NodeParser`, which will be used to create LlamaIndex nodes out of the documents"
|
||||
"Below we define:\n",
|
||||
"\n",
|
||||
"- `DoclingPDFReader` which will be used to create LlamaIndex documents, and\n",
|
||||
"- `HierarchicalJSONNodeParser`, which can be used to create LlamaIndex nodes out of JSON-based documents\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -94,50 +95,54 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from enum import Enum\n",
|
||||
"from typing import Iterable\n",
|
||||
"from pathlib import Path\n",
|
||||
"from typing import Any, Iterable\n",
|
||||
"\n",
|
||||
"from llama_index.core.readers.base import BasePydanticReader\n",
|
||||
"from llama_index.core.schema import Document as LIDocument\n",
|
||||
"from pydantic import BaseModel\n",
|
||||
"\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class DocumentMetadata(BaseModel):\n",
|
||||
" dl_doc_hash: str\n",
|
||||
"\n",
|
||||
"class DocMetaKeys(str, Enum):\n",
|
||||
" DL_DOC_HASH = \"dl_doc_hash\"\n",
|
||||
" ORIGIN = \"origin\"\n",
|
||||
"\n",
|
||||
"class DoclingPDFReader(BasePydanticReader):\n",
|
||||
" class ParseType(str, Enum):\n",
|
||||
" MARKDOWN = \"markdown\"\n",
|
||||
" # JSON = \"json\"\n",
|
||||
" JSON = \"json\"\n",
|
||||
"\n",
|
||||
" parse_type: ParseType = ParseType.MARKDOWN\n",
|
||||
"\n",
|
||||
" def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:\n",
|
||||
" file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
|
||||
" def lazy_load_data(\n",
|
||||
" self,\n",
|
||||
" file_path: str | Path | Iterable[str] | Iterable[Path],\n",
|
||||
" *args: Any,\n",
|
||||
" **load_kwargs: Any,\n",
|
||||
" ) -> Iterable[LIDocument]:\n",
|
||||
" file_paths = (\n",
|
||||
" file_path\n",
|
||||
" if isinstance(file_path, Iterable) and not isinstance(file_path, str)\n",
|
||||
" else [file_path]\n",
|
||||
" )\n",
|
||||
" converter = DocumentConverter()\n",
|
||||
" for source in file_paths:\n",
|
||||
" dl_doc = converter.convert_single(source).output\n",
|
||||
" match self.parse_type:\n",
|
||||
" case self.ParseType.MARKDOWN:\n",
|
||||
" text = dl_doc.export_to_markdown()\n",
|
||||
" # case self.ParseType.JSON:\n",
|
||||
" # text = dl_doc.model_dump_json()\n",
|
||||
" case self.ParseType.JSON:\n",
|
||||
" text = dl_doc.model_dump_json()\n",
|
||||
" case _:\n",
|
||||
" raise RuntimeError(\n",
|
||||
" f\"Unexpected parse type encountered: {self.parse_type}\"\n",
|
||||
" f\"Unexpected export type encountered: {self.export_type}\"\n",
|
||||
" )\n",
|
||||
" excl_metadata_keys = [\"dl_doc_hash\"]\n",
|
||||
" li_doc = LIDocument(\n",
|
||||
" doc_id=dl_doc.file_info.document_hash,\n",
|
||||
" text=text,\n",
|
||||
" excluded_embed_metadata_keys=excl_metadata_keys,\n",
|
||||
" excluded_llm_metadata_keys=excl_metadata_keys,\n",
|
||||
" )\n",
|
||||
" li_doc.metadata = DocumentMetadata(\n",
|
||||
" dl_doc_hash=dl_doc.file_info.document_hash,\n",
|
||||
" ).model_dump()\n",
|
||||
" origin = str(source) if isinstance(source, Path) else source\n",
|
||||
" li_doc = LIDocument(text=text)\n",
|
||||
" li_doc.metadata = {\n",
|
||||
" DocMetaKeys.DL_DOC_HASH: dl_doc.file_info.document_hash,\n",
|
||||
" DocMetaKeys.ORIGIN: origin,\n",
|
||||
" }\n",
|
||||
" yield li_doc"
|
||||
]
|
||||
},
|
||||
@ -147,10 +152,143 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core.node_parser import MarkdownNodeParser\n",
|
||||
"from typing import Any, Iterable, Sequence\n",
|
||||
"\n",
|
||||
"reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n",
|
||||
"node_parser = MarkdownNodeParser()\n",
|
||||
"from docling_core.transforms.chunker import ChunkWithMetadata, HierarchicalChunker\n",
|
||||
"from docling_core.types import Document as DLDocument\n",
|
||||
"from llama_index.core import Document as LIDocument\n",
|
||||
"from llama_index.core.node_parser.interface import NodeParser\n",
|
||||
"from llama_index.core.schema import (\n",
|
||||
" BaseNode,\n",
|
||||
" NodeRelationship,\n",
|
||||
" RelatedNodeType,\n",
|
||||
" TextNode,\n",
|
||||
")\n",
|
||||
"from llama_index.core.utils import get_tqdm_iterable\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class NodeMetaKeys(str, Enum):\n",
|
||||
" PATH = \"path\"\n",
|
||||
" PAGE = \"page\"\n",
|
||||
" BBOX = \"bbox\"\n",
|
||||
" ORIGIN = \"origin\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class HierarchicalJSONNodeParser(NodeParser):\n",
|
||||
"\n",
|
||||
" def _parse_nodes(\n",
|
||||
" self,\n",
|
||||
" nodes: Sequence[BaseNode],\n",
|
||||
" show_progress: bool = False,\n",
|
||||
" **kwargs: Any,\n",
|
||||
" ) -> list[BaseNode]:\n",
|
||||
" nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(\n",
|
||||
" items=nodes, show_progress=show_progress, desc=\"Parsing nodes\"\n",
|
||||
" )\n",
|
||||
" all_nodes: list[BaseNode] = []\n",
|
||||
" chunker = HierarchicalChunker()\n",
|
||||
" for input_node in nodes_with_progress:\n",
|
||||
" li_doc = LIDocument.model_validate(input_node)\n",
|
||||
" dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())\n",
|
||||
" chunk_iter = chunker.chunk(dl_doc=dl_doc)\n",
|
||||
" for chunk in chunk_iter:\n",
|
||||
" rels: dict[NodeRelationship, RelatedNodeType] = {\n",
|
||||
" NodeRelationship.SOURCE: li_doc.as_related_node_info(),\n",
|
||||
" }\n",
|
||||
" excl_doc_meta_keys = [d.value for d in DocMetaKeys]\n",
|
||||
" excl_node_meta_keys = [n.value for n in NodeMetaKeys]\n",
|
||||
" excl_meta_keys = excl_doc_meta_keys + excl_node_meta_keys\n",
|
||||
" node = TextNode(\n",
|
||||
" text=chunk.text,\n",
|
||||
" excluded_embed_metadata_keys=excl_meta_keys,\n",
|
||||
" excluded_llm_metadata_keys=excl_meta_keys,\n",
|
||||
" relationships=rels,\n",
|
||||
" )\n",
|
||||
" node.metadata = {NodeMetaKeys.PATH: chunk.path}\n",
|
||||
" if isinstance(chunk, ChunkWithMetadata):\n",
|
||||
" node.metadata[NodeMetaKeys.PAGE] = chunk.page\n",
|
||||
" node.metadata[NodeMetaKeys.BBOX] = chunk.bbox\n",
|
||||
" all_nodes.append(node)\n",
|
||||
" return all_nodes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Reader and node parser"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Using JSON"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To leverage Docling's rich document structure format, we can namely export to JSON and use the HierarchicalJSONNodeParser accordingly:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)\n",
|
||||
"node_parser = HierarchicalJSONNodeParser()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Using Markdown"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# from llama_index.core.node_parser import MarkdownNodeParser\n",
|
||||
"\n",
|
||||
"# reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n",
|
||||
"# node_parser = MarkdownNodeParser()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Transformations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Our transformations currently include the `node_parser`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"transformations = [node_parser]"
|
||||
]
|
||||
},
|
||||
@ -163,7 +301,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -171,7 +309,7 @@
|
||||
"\n",
|
||||
"# splitter = TokenTextSplitter(\n",
|
||||
"# chunk_size=1024,\n",
|
||||
"# chunk_overlap=20,\n",
|
||||
"# chunk_overlap=0,\n",
|
||||
"# )\n",
|
||||
"# transformations.append(splitter)"
|
||||
]
|
||||
@ -185,13 +323,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
|
||||
"\n",
|
||||
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")"
|
||||
"embed_model = HuggingFaceEmbedding(model_name=\"intfloat/multilingual-e5-small\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -203,7 +341,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -212,7 +350,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -234,6 +372,7 @@
|
||||
")\n",
|
||||
"MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"basic_llamaindex_pipeline\")\n",
|
||||
"MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get(\"MILVUS_KWARGS\", \"{}\"))\n",
|
||||
"\n",
|
||||
"vector_store = MilvusVectorStore(\n",
|
||||
" uri=MILVUS_URL,\n",
|
||||
" collection_name=MILVUS_COLL_NAME,\n",
|
||||
@ -245,35 +384,21 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "536daee038de4d52a793445c6d853c72",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">[</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Document</span><span style=\"font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span><span style=\"font-weight: bold\">}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"font-weight: bold\">]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"font-weight: bold\">]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'## DocLayNet: A Large Human-Annotated Dataset for '</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">50593</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">173793</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
||||
@ -287,13 +412,13 @@
|
||||
"text/plain": [
|
||||
"\u001b[1m[\u001b[0m\n",
|
||||
"\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'## DocLayNet: A Large Human-Annotated Dataset for '\u001b[0m+\u001b[1;36m50593\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"'\u001b[0m+\u001b[1;36m173793\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
||||
@ -341,7 +466,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -364,75 +489,227 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Response</span><span style=\"font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">response</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'80863 pages were human annotated.'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">response</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'80863 pages were annotated by humans.'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">source_nodes</span>=<span style=\"font-weight: bold\">[</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'8874a117-d181-4f4f-a30b-0b5604370d77'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'79ee790b-73d8-4268-90d7-301b5cd5e8f4'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'3 THE DOCLAYNET DATASET\\n\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape o'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5775</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[36]'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">317.11236572265625</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">116.19312286376953</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">559.7131958007812</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">202.27523803710938</span><span style=\"font-weight: bold\">]</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>, <span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">NodeRelationship.SOURCE:</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.DOCUMENT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'4'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.PREVIOUS: </span><span style=\"color: #008000; text-decoration-color: #008000\">'2'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'5509c0ef-2890-4bba-aa0f-82c0c389a621'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'d2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.NEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'3'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'47f51f1f-e92f-4d82-b36e-466fa62f8e34'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'</span><span style=\"color: #000000; text-decoration-color: #000000\">+</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">296</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.8344892859458923</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'e1585b75-17f1-42b1-882a-f44e6ae4d382'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[75]'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">: </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">53.26631546020508</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">86.24749755859375</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">295.562255859375</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">215.95584106445312</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000\"> +</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span><span style=\"color: #000000; text-decoration-color: #000000\">, </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000\"> +</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">]</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.SOURCE: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'83f7b6f1-33e3-493f-8240-95662a93d4dc'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.DOCUMENT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'4'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.PREVIOUS: </span><span style=\"color: #008000; text-decoration-color: #008000\">'2'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'964511c8-a412-47c4-8a3d-e4bf92edbda4'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"color: #000000; text-decoration-color: #000000\">>,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">}</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">)</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #000000; text-decoration-color: #000000\"><NodeRelationship.NEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'3'</span><span style=\"color: #000000; text-decoration-color: #000000\">>: </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">RelatedNodeInfo</span><span style=\"color: #000000; text-decoration-color: #000000; font-weight: bold\">(</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_id</span><span style=\"color: #000000; text-decoration-color: #000000\">=</span><span style=\"color: #008000; text-decoration-color: #008000\">'9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd'</span><span style=\"color: #000000; text-decoration-color: #000000\">,</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node_type</span><span style=\"color: #000000; text-decoration-color: #000000\">=<ObjectType.TEXT: </span><span style=\"color: #008000; text-decoration-color: #008000\">'1'</span><span style=\"font-weight: bold\">></span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">hash</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\">)</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">564</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">9089</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">15114</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"font-weight: bold\">)</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.7367570400238037</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.8309065699577332</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">]</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'8874a117-d181-4f4f-a30b-0b5604370d77'</span>: <span style=\"font-weight: bold\">{</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'79ee790b-73d8-4268-90d7-301b5cd5e8f4'</span>: <span style=\"font-weight: bold\">{</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[36]'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">317.11236572265625</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">116.19312286376953</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">559.7131958007812</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">202.27523803710938</span><span style=\"font-weight: bold\">]</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">}</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'e1585b75-17f1-42b1-882a-f44e6ae4d382'</span>: <span style=\"font-weight: bold\">{</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'origin'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'https://arxiv.org/pdf/2206.01062'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'$.main-text[75]'</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'page'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>,\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'bbox'</span>: <span style=\"font-weight: bold\">[</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">53.26631546020508</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">86.24749755859375</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">295.562255859375</span>, <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">215.95584106445312</span><span style=\"font-weight: bold\">]</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">}</span>\n",
|
||||
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">}</span>\n",
|
||||
"<span style=\"font-weight: bold\">)</span>\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were human annotated.'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were annotated by humans.'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'8874a117-d181-4f4f-a30b-0b5604370d77'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'3 THE DOCLAYNET DATASET\\n\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape o'\u001b[0m+\u001b[1;36m5775\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m, \u001b[32m'origin'\u001b[0m, \u001b[32m'path'\u001b[0m, \u001b[32m'page'\u001b[0m, \u001b[32m'bbox'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.DOCUMENT: \u001b[0m\u001b[32m'4'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.PREVIOUS: \u001b[0m\u001b[32m'2'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'5509c0ef-2890-4bba-aa0f-82c0c389a621'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'd2593a3a6590fdbc8c1ce8cdb8c0a30f1305d1dcde2ec42d564cff772e10cba7'\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.NEXT: \u001b[0m\u001b[32m'3'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'47f51f1f-e92f-4d82-b36e-466fa62f8e34'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'df1e56242d89ec477ed088de11f8bb175f091ae62926228530ebefd3a2b260b4'\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'\u001b[0m\u001b[39m+\u001b[0m\u001b[1;36m296\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'text/plain'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'\\n'\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m\u001b[39m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;36m.8344892859458923\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m\u001b[39m=\u001b[0m\u001b[1;35mTextNode\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m\u001b[39m=\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m5\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m86.24749755859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m295.562255859375\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m215.95584106445312\u001b[0m\u001b[1;39m]\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'origin'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'path'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'page'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'bbox'\u001b[0m\u001b[39m, \u001b[0m\u001b[33m...\u001b[0m\u001b[39m +\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.SOURCE: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'83f7b6f1-33e3-493f-8240-95662a93d4dc'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.DOCUMENT: \u001b[0m\u001b[32m'4'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'10c71d271e8c332f43b561647f58aae7cbf5c8cdb380d0486c553cc72be5102f'\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.PREVIOUS: \u001b[0m\u001b[32m'2'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'964511c8-a412-47c4-8a3d-e4bf92edbda4'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[39m>,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m\u001b[39m=\u001b[0m\u001b[1;39m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'c753eb0a489b37f18e388ee07f2621d1ccca003300f961223659aebd14dceb09'\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[39m<NodeRelationship.NEXT: \u001b[0m\u001b[32m'3'\u001b[0m\u001b[39m>: \u001b[0m\u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1;39m(\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_id\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'9c4ed3fd-57a3-4ef1-bd0f-77d5b38e16cd'\u001b[0m\u001b[39m,\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mnode_type\u001b[0m\u001b[39m=<ObjectType.TEXT: \u001b[0m\u001b[32m'1'\u001b[0m\u001b[1m>\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mhash\u001b[0m=\u001b[32m'09eac4db77d2af009eceab4e76cdbe8ff44c6f51ca86405365d6dd5e95660646'\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'4 ANNOTATION CAMPAIGN\\nPhase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore'\u001b[0m+\u001b[1;36m564\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[1;36m9089\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[1;36m15114\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.7367570400238037\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.8309065699577332\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n",
|
||||
"\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n",
|
||||
"\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[32m'8874a117-d181-4f4f-a30b-0b5604370d77'\u001b[0m: \u001b[1m{\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[32m'79ee790b-73d8-4268-90d7-301b5cd5e8f4'\u001b[0m: \u001b[1m{\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m2\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m317.11236572265625\u001b[0m, \u001b[1;36m116.19312286376953\u001b[0m, \u001b[1;36m559.7131958007812\u001b[0m, \u001b[1;36m202.27523803710938\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[32m'e1585b75-17f1-42b1-882a-f44e6ae4d382'\u001b[0m: \u001b[1m{\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'origin'\u001b[0m: \u001b[32m'https://arxiv.org/pdf/2206.01062'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'page'\u001b[0m: \u001b[1;36m5\u001b[0m,\n",
|
||||
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'bbox'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m53.26631546020508\u001b[0m, \u001b[1;36m86.24749755859375\u001b[0m, \u001b[1;36m295.562255859375\u001b[0m, \u001b[1;36m215.95584106445312\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
"\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
"\u001b[1m)\u001b[0m\n"
|
||||
]
|
||||
@ -443,8 +720,8 @@
|
||||
],
|
||||
"source": [
|
||||
"query_engine = index.as_query_engine(llm=llm)\n",
|
||||
"query_res = query_engine.query(\"How many pages were human annotated?\")\n",
|
||||
"pprint(query_res, max_length=1, max_string=250, max_depth=4)"
|
||||
"query_res = query_engine.query(\"How many pages were annotated by humans?\")\n",
|
||||
"pprint(query_res, max_length=5, max_string=250, max_depth=6)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
6
poetry.lock
generated
6
poetry.lock
generated
@ -2530,13 +2530,13 @@ llama-index-core = ">=0.11.0,<0.12.0"
|
||||
|
||||
[[package]]
|
||||
name = "llama-index-vector-stores-milvus"
|
||||
version = "0.2.3"
|
||||
version = "0.2.5"
|
||||
description = "llama-index vector_stores milvus integration"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.8.1"
|
||||
files = [
|
||||
{file = "llama_index_vector_stores_milvus-0.2.3-py3-none-any.whl", hash = "sha256:287c3b2b8d886eac11b07db3ddf31b92dee55ac4f00fe7dc047879e2f7d79d67"},
|
||||
{file = "llama_index_vector_stores_milvus-0.2.3.tar.gz", hash = "sha256:3b2433869264f13aee752ee086f8741ea79222ebbaa0c437fa9fb21a8d56cdaf"},
|
||||
{file = "llama_index_vector_stores_milvus-0.2.5-py3-none-any.whl", hash = "sha256:020d96dad541ee80c480d7ed6fe7587020a8c8a20d54b320fa376ef4dd8c8648"},
|
||||
{file = "llama_index_vector_stores_milvus-0.2.5.tar.gz", hash = "sha256:b68093f0bf2654dd6436678bb182e1c7f3e549efc4bc9e0f10039bc6b73f7b53"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
Loading…
Reference in New Issue
Block a user