reformatted the code to pass the tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Peter Staar 2024-11-05 07:29:45 +01:00
parent 4cab7d63db
commit 98efb8957e
2 changed files with 13 additions and 21 deletions
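The edits below are pure style normalizations: quotes unified to double quotes, short multi-line calls collapsed onto one line, a trailing comma added, imports regrouped, and stray whitespace removed. A minimal sketch of how this kind of notebook cleanup is typically produced, assuming the repository's style checks use black plus isort over the notebooks (an assumption; the actual tooling is defined by the repo's CI config, and the notebook paths below are placeholders, not the files in this commit):

# Hypothetical reproduction of this kind of notebook reformatting.
# Assumes black (installed with the jupyter extra) and nbqa + isort
# are available; the paths are illustrative only.
import subprocess

notebooks = ["example_a.ipynb", "example_b.ipynb"]  # placeholder paths

for nb in notebooks:
    # black normalizes quotes, collapses short calls, adds trailing commas
    subprocess.run(["black", nb], check=True)
    # isort (run via nbqa for .ipynb files) regroups imports:
    # stdlib, then third-party, then first-party (docling)
    subprocess.run(["nbqa", "isort", nb], check=True)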

View File

@@ -16,20 +16,16 @@
 ],
 "source": [
 "from typing import Iterator\n",
 "\n",
+"import lancedb\n",
 "import semchunk\n",
-"from docling_core.transforms.chunker import (\n",
-"    BaseChunk,\n",
-"    BaseChunker,\n",
-"    HierarchicalChunker\n",
-")\n",
+"from docling_core.transforms.chunker import BaseChunk, BaseChunker, HierarchicalChunker\n",
 "from docling_core.types import DoclingDocument\n",
 "from pydantic import PositiveInt\n",
 "\n",
-"from docling.document_converter import DocumentConverter\n",
-"import lancedb\n",
-"\n",
+"from sentence_transformers import SentenceTransformer\n",
 "from transformers import AutoTokenizer\n",
-"from sentence_transformers import SentenceTransformer"
+"\n",
+"from docling.document_converter import DocumentConverter"
 ]
 },
 {
@@ -321,9 +317,7 @@
 "    return t1 + \"\\n\" + t2\n",
 "\n",
 "\n",
-"def split_by_doc_items(\n",
-"    doc_chunk: DocChunk, tokenizer, chunk_size: int\n",
-"):\n",
+"def split_by_doc_items(doc_chunk: DocChunk, tokenizer, chunk_size: int):\n",
 "    if doc_chunk.meta.doc_items == None or len(doc_chunk.meta.doc_items) <= 1:\n",
 "        return [doc_chunk]\n",
 "    length = doc_chunk_length(doc_chunk, tokenizer)\n",
@@ -618,9 +612,7 @@
 "def adjust_chunks_for_fixed_size(doc, original_chunks, tokenizer, splitter, chunk_size):\n",
 "    chunks_after_splitting_by_items = []\n",
 "    for chunk in original_chunks:\n",
-"        chunk_split_by_doc_items = split_by_doc_items(\n",
-"            chunk, tokenizer, chunk_size\n",
-"        )\n",
+"        chunk_split_by_doc_items = split_by_doc_items(chunk, tokenizer, chunk_size)\n",
 "        chunks_after_splitting_by_items.extend(chunk_split_by_doc_items)\n",
 "    chunks_after_splitting_recursively = []\n",
 "    for chunk in chunks_after_splitting_by_items:\n",
@@ -828,11 +820,11 @@
 "    output = \"\"\n",
 "    if chunk.meta.headings != None:\n",
 "        for h in chunk.meta.headings:\n",
-"            output += h + '\\n'\n",
+"            output += h + \"\\n\"\n",
 "    if chunk.meta.captions != None:\n",
 "        for c in chunk.meta.captions:\n",
-"            output += c + '\\n'\n",
-"    output += chunk.text \n",
+"            output += c + \"\\n\"\n",
+"    output += chunk.text\n",
 "    return output"
 ]
 },
@@ -854,7 +846,6 @@
 }
 ],
 "source": [
-"\n",
 "print(make_text_for_embedding(chunks[19]))"
 ]
 },
@@ -874,7 +865,7 @@
 "            \"vector\": embeddings,\n",
 "            \"text\": chunk.text,\n",
 "            \"headings\": chunk.meta.headings,\n",
-"            \"captions\": chunk.meta.captions\n",
+"            \"captions\": chunk.meta.captions,\n",
 "        }\n",
 "        data.append(data_item)\n",
 "    tbl = db.create_table(index_name, data=data, exist_ok=True)\n",

View File

@@ -85,6 +85,7 @@
 "\n",
 "from docling.document_converter import DocumentConverter\n",
 "\n",
+"\n",
 "class DoclingPDFLoader(BaseLoader):\n",
 "\n",
 "    def __init__(self, file_path: str | list[str]) -> None:\n",