mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 22:14:37 +00:00
reformatted the code to pass the tests
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
4cab7d63db
commit
98efb8957e
@ -16,20 +16,16 @@
|
||||
],
|
||||
"source": [
|
||||
"from typing import Iterator\n",
|
||||
"\n",
|
||||
"import lancedb\n",
|
||||
"import semchunk\n",
|
||||
"from docling_core.transforms.chunker import (\n",
|
||||
" BaseChunk,\n",
|
||||
" BaseChunker,\n",
|
||||
" HierarchicalChunker\n",
|
||||
")\n",
|
||||
"from docling_core.transforms.chunker import BaseChunk, BaseChunker, HierarchicalChunker\n",
|
||||
"from docling_core.types import DoclingDocument\n",
|
||||
"from pydantic import PositiveInt\n",
|
||||
"\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"import lancedb\n",
|
||||
"\n",
|
||||
"from sentence_transformers import SentenceTransformer\n",
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"from sentence_transformers import SentenceTransformer"
|
||||
"\n",
|
||||
"from docling.document_converter import DocumentConverter"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -321,9 +317,7 @@
|
||||
" return t1 + \"\\n\" + t2\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def split_by_doc_items(\n",
|
||||
" doc_chunk: DocChunk, tokenizer, chunk_size: int\n",
|
||||
"):\n",
|
||||
"def split_by_doc_items(doc_chunk: DocChunk, tokenizer, chunk_size: int):\n",
|
||||
" if doc_chunk.meta.doc_items == None or len(doc_chunk.meta.doc_items) <= 1:\n",
|
||||
" return [doc_chunk]\n",
|
||||
" length = doc_chunk_length(doc_chunk, tokenizer)\n",
|
||||
@ -618,9 +612,7 @@
|
||||
"def adjust_chunks_for_fixed_size(doc, original_chunks, tokenizer, splitter, chunk_size):\n",
|
||||
" chunks_after_splitting_by_items = []\n",
|
||||
" for chunk in original_chunks:\n",
|
||||
" chunk_split_by_doc_items = split_by_doc_items(\n",
|
||||
" chunk, tokenizer, chunk_size\n",
|
||||
" )\n",
|
||||
" chunk_split_by_doc_items = split_by_doc_items(chunk, tokenizer, chunk_size)\n",
|
||||
" chunks_after_splitting_by_items.extend(chunk_split_by_doc_items)\n",
|
||||
" chunks_after_splitting_recursively = []\n",
|
||||
" for chunk in chunks_after_splitting_by_items:\n",
|
||||
@ -828,11 +820,11 @@
|
||||
" output = \"\"\n",
|
||||
" if chunk.meta.headings != None:\n",
|
||||
" for h in chunk.meta.headings:\n",
|
||||
" output += h + '\\n'\n",
|
||||
" output += h + \"\\n\"\n",
|
||||
" if chunk.meta.captions != None:\n",
|
||||
" for c in chunk.meta.captions:\n",
|
||||
" output += c + '\\n'\n",
|
||||
" output += chunk.text \n",
|
||||
" output += c + \"\\n\"\n",
|
||||
" output += chunk.text\n",
|
||||
" return output"
|
||||
]
|
||||
},
|
||||
@ -854,7 +846,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"print(make_text_for_embedding(chunks[19]))"
|
||||
]
|
||||
},
|
||||
@ -874,7 +865,7 @@
|
||||
" \"vector\": embeddings,\n",
|
||||
" \"text\": chunk.text,\n",
|
||||
" \"headings\": chunk.meta.headings,\n",
|
||||
" \"captions\": chunk.meta.captions\n",
|
||||
" \"captions\": chunk.meta.captions,\n",
|
||||
" }\n",
|
||||
" data.append(data_item)\n",
|
||||
" tbl = db.create_table(index_name, data=data, exist_ok=True)\n",
|
||||
|
@ -85,6 +85,7 @@
|
||||
"\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class DoclingPDFLoader(BaseLoader):\n",
|
||||
"\n",
|
||||
" def __init__(self, file_path: str | list[str]) -> None:\n",
|
||||
|
Loading…
Reference in New Issue
Block a user