diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb index f2464f29..31ff009a 100644 --- a/docs/examples/rag_langchain.ipynb +++ b/docs/examples/rag_langchain.ipynb @@ -49,18 +49,6 @@ "load_dotenv()" ] }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", - "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -86,54 +74,37 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "from enum import Enum\n", "from typing import Iterator\n", "\n", "from langchain_core.document_loaders import BaseLoader\n", "from langchain_core.documents import Document as LCDocument\n", - "from pydantic import BaseModel\n", "\n", "from docling.document_converter import DocumentConverter\n", "\n", - "\n", - "class DocumentMetadata(BaseModel):\n", - " dl_doc_hash: str\n", - " # source: str\n", - "\n", - "\n", "class DoclingPDFLoader(BaseLoader):\n", - " class ParseType(str, Enum):\n", - " MARKDOWN = \"markdown\"\n", - " # JSON = \"json\"\n", "\n", - " def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n", + " def __init__(self, file_path: str | list[str]) -> None:\n", " self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n", - " self._parse_type = parse_type\n", " self._converter = DocumentConverter()\n", "\n", " def lazy_load(self) -> Iterator[LCDocument]:\n", " for source in self._file_paths:\n", - " dl_doc = self._converter.convert_single(source).output\n", - " match self._parse_type:\n", - " case self.ParseType.MARKDOWN:\n", - " text = dl_doc.export_to_markdown()\n", - " # case self.ParseType.JSON:\n", - " # text = dl_doc.model_dump_json()\n", - " case _:\n", - " raise 
RuntimeError(\n", - " f\"Unexpected parse type encountered: {self._parse_type}\"\n", - " )\n", - " lc_doc = LCDocument(\n", - " page_content=text,\n", - " metadata=DocumentMetadata(\n", - " dl_doc_hash=dl_doc.file_info.document_hash,\n", - " ).model_dump(),\n", - " )\n", - " yield lc_doc" + " dl_doc = self._converter.convert(source).document\n", + " text = dl_doc.export_to_markdown()\n", + " yield LCDocument(page_content=text)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "FILE_PATH = \"https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf\" # DocLayNet paper" ] }, { @@ -141,37 +112,10 @@ "execution_count": 5, "metadata": {}, "outputs": [], - "source": [ - "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1b38d07d5fed4618a44ecf261e1e5c44", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 7 files: 0%| | 0/7 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "\n", - "loader = DoclingPDFLoader(\n", - " file_path=FILE_PATH,\n", - " parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n", - ")\n", + "loader = DoclingPDFLoader(file_path=FILE_PATH)\n", "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=1000,\n", " chunk_overlap=200,\n", @@ -187,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -204,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -223,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ 
@@ -232,7 +176,7 @@ "from langchain_milvus import Milvus\n", "\n", "MILVUS_URI = os.environ.get(\n", - " \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n", + " \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n", ")\n", "\n", "vectorstore = Milvus.from_documents(\n", @@ -252,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -287,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -319,16 +263,16 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'The human annotation of DocLayNet was performed on 80863 pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'" + "'- 80,863 pages were human annotated for DocLayNet.'" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -336,13 +280,6 @@ "source": [ "rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/examples/rag_llamaindex.ipynb b/docs/examples/rag_llamaindex.ipynb index bf52a0d8..e5b8d68d 100644 --- a/docs/examples/rag_llamaindex.ipynb +++ b/docs/examples/rag_llamaindex.ipynb @@ -14,6 +14,13 @@ "# RAG with LlamaIndex 🦙" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> ℹ️ 👉 **The LlamaIndex Docling extension update to Docling v2 is ongoing; in the meanwhile, this notebook is showing current extension output, based on Docling v1.**" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/docs/integrations/llamaindex.md b/docs/integrations/llamaindex.md index d3c0f46e..af82da31 100644 --- 
a/docs/integrations/llamaindex.md +++ b/docs/integrations/llamaindex.md @@ -4,6 +4,10 @@ Docling is available as an official LlamaIndex extension! To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/). +!!! info "Docling v2" + + The LlamaIndex Docling extension update to Docling v2 is ongoing. + ## Components ### Docling Reader diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 9071c5ff..195acaf1 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -1,5 +1,5 @@ {% extends "base.html" %} {% block announce %} -
🎉 Docling is going v2, check out what's new and how to get started!
+🎉 Docling has gone v2! Check out what's new and how to get started!
{% endblock %}