Merge branch 'main' into nli/layoutmodel_improvements

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2025-05-09 14:47:44 +02:00
commit 6e956dc551
72 changed files with 2374963 additions and 551 deletions

View File

@ -1,3 +1,26 @@
## [v2.31.0](https://github.com/docling-project/docling/releases/tag/v2.31.0) - 2025-04-25
### Feature
* Add tutorial using Milvus and Docling for RAG pipeline ([#1449](https://github.com/docling-project/docling/issues/1449)) ([`a2fbbba`](https://github.com/docling-project/docling/commit/a2fbbba9f7f889a1f84f8642cf5c75feb57e8668))
### Fix
* **html:** Handle address, details, and summary tags ([#1436](https://github.com/docling-project/docling/issues/1436)) ([`ed20124`](https://github.com/docling-project/docling/commit/ed20124544a1b10f068b11bbdf12e1bfc7567195))
* Treat overflowing -v flags as DEBUG ([#1419](https://github.com/docling-project/docling/issues/1419)) ([`8012a3e`](https://github.com/docling-project/docling/commit/8012a3e4d6b9ce4cae28210d525d87175da2f5c2))
* **codecov:** Fix codecov argument and yaml file ([#1399](https://github.com/docling-project/docling/issues/1399)) ([`fa7fc9e`](https://github.com/docling-project/docling/commit/fa7fc9e63d45f44af57dd6ad7636a2a16f04b8c4))
### Documentation
* Fix wrong output format in example code ([#1427](https://github.com/docling-project/docling/issues/1427)) ([`c2470ed`](https://github.com/docling-project/docling/commit/c2470ed216eaf3aae0ad16306de19682fa55b99b))
* Add OpenSSF Best Practices badge ([#1430](https://github.com/docling-project/docling/issues/1430)) ([`64918a8`](https://github.com/docling-project/docling/commit/64918a81ac315ea0108f1411a1537dd12117e49c))
* Typo fixes in docling_document.md ([#1400](https://github.com/docling-project/docling/issues/1400)) ([`995b3b0`](https://github.com/docling-project/docling/commit/995b3b0ab1c4e566eaba2ea31af3db21eb12a7ae))
* Updated the [Usage] link in architecture.md ([#1416](https://github.com/docling-project/docling/issues/1416)) ([`88948b0`](https://github.com/docling-project/docling/commit/88948b0bbaba2ecbaa71f703d2cc94055a3e6b3e))
* **ocr:** Add docs entry for OnnxTR OCR plugin ([#1382](https://github.com/docling-project/docling/issues/1382)) ([`a7dd59c`](https://github.com/docling-project/docling/commit/a7dd59c5cb3e7f1eba76c7e2e20be79d8fa5b367))
* **security:** More statements about secure development ([#1381](https://github.com/docling-project/docling/issues/1381)) ([`293c28c`](https://github.com/docling-project/docling/commit/293c28ca7c4a44dcd56595ed2fe0372fe1b531b2))
* Add testing in the docs ([#1379](https://github.com/docling-project/docling/issues/1379)) ([`01fbfd5`](https://github.com/docling-project/docling/commit/01fbfd565204258acb2986dcaefad3a328626c66))
* Add Notes for Installing in Intel macOS ([#1377](https://github.com/docling-project/docling/issues/1377)) ([`a026b4e`](https://github.com/docling-project/docling/commit/a026b4e84bcc8e11ceaa6d9a46c7c741000aff44))
## [v2.30.0](https://github.com/docling-project/docling/releases/tag/v2.30.0) - 2025-04-14
### Feature

View File

@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
)
return _txt
# restore original HTML by removing previouly added markers
# restore original HTML by removing previously added markers
for regex in [
rf"<pre>\s*<code>\s*{_START_MARKER}",
rf"{_STOP_MARKER}\s*</code>\s*</pre>",

View File

@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
# Identify wether list is a numbered list or not
# Identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_id, p_level = self._get_label_and_level(paragraph)

View File

@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
# Initialize the root of the document hiearchy
# Initialize the root of the document hierarchy
self.root: Optional[NodeItem] = None
self.valid = False

View File

@ -1,6 +1,6 @@
"""Backend to parse patents from the United States Patent Office (USPTO).
The parsers included in this module can handle patent grants pubished since 1976 and
The parsers included in this module can handle patent grants published since 1976 and
patent applications since 2001.
The original files can be found in https://bulkdata.uspto.gov.
"""
@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
)
elif name == self.Element.PARAGRAPH.value and text:
# remmove blank spaces added in paragraphs
# remove blank spaces added in paragraphs
text = re.sub("\\s+", " ", text)
if self.Element.ABSTRACT.value in self.property:
self.abstract = (
@ -1697,7 +1697,7 @@ class XmlTable:
class HtmlEntity:
"""Provide utility functions to get the HTML entities of styled characters.
This class has been developped from:
This class has been developed from:
https://unicode-table.com/en/html-entities/
https://www.w3.org/TR/WD-math-970515/table03.html
"""
@ -1896,7 +1896,7 @@ class HtmlEntity:
"""Get an HTML entity of a greek letter in ISO 8879.
Args:
The text to transform, as an ISO 8879 entitiy.
The text to transform, as an ISO 8879 entity.
Returns:
The HTML entity representing a greek letter. If the input text is not

View File

@ -521,7 +521,7 @@ def convert( # noqa: C901
if image_export_mode != ImageRefMode.PLACEHOLDER:
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = (
True # FIXME: to be deprecated in verson 3
True # FIXME: to be deprecated in version 3
)
pipeline_options.images_scale = 2

View File

@ -189,7 +189,9 @@ class DocumentConverter:
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
"""Generate a hash of pipeline options to use as part of the cache key."""
options_str = str(pipeline_options.model_dump())
return hashlib.md5(options_str.encode("utf-8")).hexdigest()
return hashlib.md5(
options_str.encode("utf-8"), usedforsecurity=False
).hexdigest()
def initialize_pipeline(self, format: InputFormat):
"""Initialize the conversion pipeline for the selected format."""

View File

@ -57,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
artifacts_path,
torch_dtype=torch.bfloat16,
_attn_implementation=(
"flash_attention_2" if self.device.startswith("cuda") else "eager"
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
).to(self.device)

View File

@ -346,7 +346,7 @@ class ReadingOrderModel:
new_item.prov.append(prov)
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
page_elements = self._assembled_to_readingorder_elements(conv_res)
# Apply reading order

View File

@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
tcells = table_cluster.cells
tokens = []
for c in tcells:
# Only allow non empty stings (spaces) into the cells of a table
# Only allow non empty strings (spaces) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.rect = BoundingRectangle.from_bounding_box(
@ -267,7 +267,7 @@ class TableStructureModel(BasePageModel):
element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element)
if self.do_cell_matching and tc.bbox is not None:
if tc.bbox is not None:
tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc)

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import logging
from collections.abc import Iterable
from pathlib import Path
@ -38,6 +40,8 @@ class TesseractOcrModel(BaseOcrModel):
self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.enabled:
install_errmsg = (
@ -84,9 +88,7 @@ class TesseractOcrModel(BaseOcrModel):
"oem": tesserocr.OEM.DEFAULT,
}
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path
@ -151,7 +153,7 @@ class TesseractOcrModel(BaseOcrModel):
script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}"
# Check if the detected languge is present in the system
# Check if the detected language is present in the system
if lang not in self._tesserocr_languages:
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
msg += " However this language is not installed in your system and will be ignored."

View File

@ -20,7 +20,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
"""Create a stable page_hash of the path_or_stream of a file"""
block_size = 65536
hasher = hashlib.sha256()
hasher = hashlib.sha256(usedforsecurity=False)
def _hash_buf(binary_stream):
buf = binary_stream.read(block_size) # read and page_hash in chunks
@ -38,7 +38,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
def create_hash(string: str):
hasher = hashlib.sha256()
hasher = hashlib.sha256(usedforsecurity=False)
hasher.update(string.encode("utf-8"))
return hasher.hexdigest()

View File

@ -569,7 +569,7 @@
"The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
"In this notebook, we will leverage:\n",
"- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
"- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
"- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
"- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
"\n",
"Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."

View File

@ -206,7 +206,7 @@
"source": [
"Points to notice looking at the output chunks below:\n",
"- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
"- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
"- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
"- Where possible, we merge undersized peer chunks (see chunk 0)\n",
"- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)"
]

View File

@ -279,7 +279,7 @@
"## Use other vision models\n",
"\n",
"The examples above can also be reproduced using other vision models.\n",
"The Docling options `PictureDescriptionVlmOptions` allows to speficy your favorite vision model from the Hugging Face Hub."
"The Docling options `PictureDescriptionVlmOptions` allows to specify your favorite vision model from the Hugging Face Hub."
]
},
{

View File

@ -0,0 +1,551 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/docling-project/docling/blob/main/docs/examples/rag_milvus.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Milvus\n",
"\n",
"| Step | Tech | Execution |\n",
"| --- | --- | --- |\n",
"| Embedding | OpenAI (text-embedding-3-small) | 🌐 Remote |\n",
"| Vector store | Milvus | 💻 Local |\n",
"| Gen AI | OpenAI (gpt-4o) | 🌐 Remote |\n",
"\n",
"\n",
"## A recipe 🧑‍🍳 🐥 💚\n",
"\n",
"This is a code recipe that uses [Milvus](https://milvus.io/), the world's most advanced open-source vector database, to perform RAG over documents parsed by [Docling](https://docling-project.github.io/docling/).\n",
"\n",
"In this notebook, we accomplish the following:\n",
"* Parse documents using Docling's document conversion capabilities\n",
"* Perform hierarchical chunking of the documents using Docling\n",
"* Generate text embeddings with OpenAI\n",
"* Perform RAG using Milvus, the world's most advanced open-source vector database\n",
"\n",
"Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
"1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
"2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparation\n",
"\n",
"### Dependencies and Environment\n",
"\n",
"To start, install the required dependencies by running the following command:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install --upgrade pymilvus docling openai torch"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> If you are using Google Colab, to enable dependencies just installed, you may need to **restart the runtime** (click on the \"Runtime\" menu at the top of the screen, and select \"Restart session\" from the dropdown menu)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### GPU Checking"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Part of what makes Docling so remarkable is the fact that it can run on commodity hardware. This means that this notebook can be run on a local machine with GPU acceleration. If you're using a MacBook with an Apple Silicon chip, Docling integrates seamlessly with Metal Performance Shaders (MPS). MPS provides out-of-the-box GPU acceleration for macOS, integrating with PyTorch and TensorFlow, offering energy-efficient performance on Apple Silicon, and broad compatibility with all Metal-supported GPUs.\n",
"\n",
"The code below checks to see if a GPU is available, either via CUDA or MPS."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MPS GPU is enabled.\n"
]
}
],
"source": [
"import torch\n",
"\n",
"# Check if GPU or MPS is available\n",
"if torch.cuda.is_available():\n",
" device = torch.device(\"cuda\")\n",
" print(f\"CUDA GPU is enabled: {torch.cuda.get_device_name(0)}\")\n",
"elif torch.backends.mps.is_available():\n",
" device = torch.device(\"mps\")\n",
" print(\"MPS GPU is enabled.\")\n",
"else:\n",
" raise OSError(\n",
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setting Up API Keys\n",
"\n",
"We will use OpenAI as the LLM in this example. You should prepare the [OPENAI_API_KEY](https://platform.openai.com/docs/quickstart) as an environment variable."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"sk-***********\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare the LLM and Embedding Model\n",
"\n",
"We initialize the OpenAI client to prepare the embedding model.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"\n",
"openai_client = OpenAI()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a function to generate text embeddings using OpenAI client. We use the [text-embedding-3-small](https://platform.openai.com/docs/guides/embeddings) model as an example."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def emb_text(text):\n",
" return (\n",
" openai_client.embeddings.create(input=text, model=\"text-embedding-3-small\")\n",
" .data[0]\n",
" .embedding\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Generate a test embedding and print its dimension and first few elements."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1536\n",
"[0.009889289736747742, -0.005578675772994757, 0.00683477520942688, -0.03805781528353691, -0.01824733428657055, -0.04121600463986397, -0.007636285852640867, 0.03225184231996536, 0.018949154764413834, 9.352207416668534e-05]\n"
]
}
],
"source": [
"test_embedding = emb_text(\"This is a test\")\n",
"embedding_dim = len(test_embedding)\n",
"print(embedding_dim)\n",
"print(test_embedding[:10])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Process Data Using Docling\n",
"\n",
"Docling can parse various document formats into a unified representation (Docling Document), which can then be exported to different output formats. For a full list of supported input and output formats, please refer to [the official documentation](https://docling-project.github.io/docling/usage/supported_formats/).\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial, we will use a Markdown file ([source](https://milvus.io/docs/overview.md)) as the input. We will process the document using a **HierarchicalChunker** provided by Docling to generate structured, hierarchical chunks suitable for downstream RAG tasks."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from docling_core.transforms.chunker import HierarchicalChunker\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"converter = DocumentConverter()\n",
"chunker = HierarchicalChunker()\n",
"\n",
"# Convert the input file to Docling Document\n",
"source = \"https://milvus.io/docs/overview.md\"\n",
"doc = converter.convert(source).document\n",
"\n",
"# Perform hierarchical chunking\n",
"texts = [chunk.text for chunk in chunker.chunk(doc)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data into Milvus\n",
"\n",
"### Create the collection\n",
"\n",
"With data in hand, we can create a `MilvusClient` instance and insert the data into a Milvus collection. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from pymilvus import MilvusClient\n",
"\n",
"milvus_client = MilvusClient(uri=\"./milvus_demo.db\")\n",
"collection_name = \"my_rag_collection\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> About the arguments of `MilvusClient`:\n",
"> - Setting the `uri` as a local file, e.g. `./milvus.db`, is the most convenient method, as it automatically utilizes [Milvus Lite](https://milvus.io/docs/milvus_lite.md) to store all data in this file.\n",
"> - If you have a large amount of data, you can set up a more performant Milvus server on [Docker or Kubernetes](https://milvus.io/docs/quickstart.md). In this setup, please use the server URI, e.g. `http://localhost:19530`, as your `uri`.\n",
"> - If you want to use [Zilliz Cloud](https://zilliz.com/cloud), the fully managed cloud service for Milvus, adjust the `uri` and `token`, which correspond to the [Public Endpoint and API key](https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details) in Zilliz Cloud."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check if the collection already exists and drop it if it does."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"if milvus_client.has_collection(collection_name):\n",
" milvus_client.drop_collection(collection_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a new collection with specified parameters.\n",
"\n",
"If we don't specify any field information, Milvus will automatically create a default `id` field for the primary key, and a `vector` field to store the vector data. A reserved JSON field is used to store non-schema-defined fields and their values."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"milvus_client.create_collection(\n",
" collection_name=collection_name,\n",
" dimension=embedding_dim,\n",
" metric_type=\"IP\", # Inner product distance\n",
" consistency_level=\"Strong\", # Supported values are (`\"Strong\"`, `\"Session\"`, `\"Bounded\"`, `\"Eventually\"`). See https://milvus.io/docs/consistency.md#Consistency-Level for more details.\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Insert data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing chunks: 100%|██████████| 38/38 [00:14<00:00, 2.59it/s]\n"
]
},
{
"data": {
"text/plain": [
"{'insert_count': 38, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37], 'cost': 0}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from tqdm import tqdm\n",
"\n",
"data = []\n",
"\n",
"for i, chunk in enumerate(tqdm(texts, desc=\"Processing chunks\")):\n",
" embedding = emb_text(chunk)\n",
" data.append({\"id\": i, \"vector\": embedding, \"text\": chunk})\n",
"\n",
"milvus_client.insert(collection_name=collection_name, data=data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build RAG"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve data for a query\n",
"\n",
"Let's specify a query question about the document we just processed."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"question = (\n",
" \"What are the three deployment modes of Milvus, and what are their differences?\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Search for the question in the collection and retrieve the semantic top-3 matches."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"search_res = milvus_client.search(\n",
" collection_name=collection_name,\n",
" data=[emb_text(question)],\n",
" limit=3,\n",
" search_params={\"metric_type\": \"IP\", \"params\": {}},\n",
" output_fields=[\"text\"],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's take a look at the search results of the query.\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" [\n",
" \"Milvus offers three deployment modes, covering a wide range of data scales\\u2014from local prototyping in Jupyter Notebooks to massive Kubernetes clusters managing tens of billions of vectors:\",\n",
" 0.6503315567970276\n",
" ],\n",
" [\n",
" \"Milvus Lite is a Python library that can be easily integrated into your applications. As a lightweight version of Milvus, it\\u2019s ideal for quick prototyping in Jupyter Notebooks or running on edge devices with limited resources. Learn more.\\nMilvus Standalone is a single-machine server deployment, with all components bundled into a single Docker image for convenient deployment. Learn more.\\nMilvus Distributed can be deployed on Kubernetes clusters, featuring a cloud-native architecture designed for billion-scale or even larger scenarios. This architecture ensures redundancy in critical components. Learn more.\",\n",
" 0.6281915903091431\n",
" ],\n",
" [\n",
" \"What is Milvus?\\nUnstructured Data, Embeddings, and Milvus\\nWhat Makes Milvus so Fast\\uff1f\\nWhat Makes Milvus so Scalable\\nTypes of Searches Supported by Milvus\\nComprehensive Feature Set\",\n",
" 0.6117826700210571\n",
" ]\n",
"]\n"
]
}
],
"source": [
"import json\n",
"\n",
"retrieved_lines_with_distances = [\n",
" (res[\"entity\"][\"text\"], res[\"distance\"]) for res in search_res[0]\n",
"]\n",
"print(json.dumps(retrieved_lines_with_distances, indent=4))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Use LLM to get a RAG response\n",
"\n",
"Convert the retrieved documents into a string format.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"context = \"\\n\".join(\n",
" [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define system and user prompts for the Language Model. This prompt is assembled with the retrieved documents from Milvus.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"SYSTEM_PROMPT = \"\"\"\n",
"Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.\n",
"\"\"\"\n",
"USER_PROMPT = f\"\"\"\n",
"Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.\n",
"<context>\n",
"{context}\n",
"</context>\n",
"<question>\n",
"{question}\n",
"</question>\n",
"\"\"\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use OpenAI ChatGPT to generate a response based on the prompts."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The three deployment modes of Milvus are:\n",
"\n",
"1. **Milvus Lite**: This is a Python library that integrates easily into your applications. It's a lightweight version ideal for quick prototyping in Jupyter Notebooks or for running on edge devices with limited resources.\n",
"\n",
"2. **Milvus Standalone**: This mode is a single-machine server deployment where all components are bundled into a single Docker image, making it convenient to deploy.\n",
"\n",
"3. **Milvus Distributed**: This mode is designed for deployment on Kubernetes clusters. It features a cloud-native architecture suited for managing scenarios at a billion-scale or larger, ensuring redundancy in critical components.\n"
]
}
],
"source": [
"response = openai_client.chat.completions.create(\n",
" model=\"gpt-4o\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
" {\"role\": \"user\", \"content\": USER_PROMPT},\n",
" ],\n",
")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -43,7 +43,7 @@
"\n",
"Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
"1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
"2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
"2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU."
]
},
{
@ -716,7 +716,7 @@
"id": "7tGz49nfUegG"
},
"source": [
"We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documetation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
"We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
]
}
],

View File

@ -137,7 +137,7 @@ This is a collection of FAQ collected from the user questions on <https://github
### Some images are missing from MS Word and Powerpoint
The image processing library used by Docling is able to handle embedded WMF images only on Windows platform.
If you are on other operaring systems, these images will be ignored.
If you are on other operating systems, these images will be ignored.
??? question "`HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'"

View File

@ -200,7 +200,7 @@ You can limit the CPU threads used by Docling by setting the environment variabl
using a `DocumentConverter` (high-level API) as discussed in the sections above
should suffice and is the recommended way.
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](../supported_formats.md)).
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](supported_formats.md)).
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](../examples/run_with_formats.py) example.
Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:

View File

@ -37,7 +37,7 @@ docling ./input/dir --output ./scratch --abort-on-error
### Setting up a `DocumentConverter`
To accomodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options
per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults
will be used for all `allowed_formats`.
@ -151,7 +151,7 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
## Inspect the converted document:
conv_result.document.print_element_tree()
## Iterate the elements in reading order, including hierachy level:
## Iterate the elements in reading order, including hierarchy level:
for item, level in conv_result.document.iterate_items():
if isinstance(item, TextItem):
print(item.text)

View File

@ -101,6 +101,7 @@ nav:
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Formula enrichment": examples/develop_formula_understanding.py
- 🗂️ More examples:
- examples/rag_milvus.ipynb
- examples/rag_weaviate.ipynb
- RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb
- examples/rag_azuresearch.ipynb

984
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
version = "2.30.0" # DO NOT EDIT, updated automatically
version = "2.31.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = [
"Christoph Auer <cau@zurich.ibm.com>",

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1 +1,83 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.5883585611979, 506.6666666666667, 767.2550252278646], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
{
"_name": "",
"type": "pdf-document",
"description": {
"title": null,
"abstract": null,
"authors": null,
"affiliations": null,
"subjects": null,
"keywords": null,
"publication_date": null,
"languages": null,
"license": null,
"publishers": null,
"url_refs": null,
"references": null,
"publication": null,
"reference_count": null,
"citation_count": null,
"citation_date": null,
"advanced": null,
"analytics": null,
"logs": [],
"collection": null,
"acquisition": null
},
"file-info": {
"filename": "ocr_test.pdf",
"filename-prov": null,
"document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3",
"model": "default",
"page": 1
}
]
},
"main-text": [
{
"prov": [
{
"bbox": [
69.0,
688.5883585611979,
506.6666666666667,
767.2550252278646
],
"page": 1,
"span": [
0,
94
],
"__ref_s3_data": null
}
],
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
}
],
"figures": [],
"tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
"height": 841.9216918945312,
"page": 1,
"width": 595.201171875
}
],
"page-footers": [],
"page-headers": [],
"_s3_data": null,
"identifiers": null
}

File diff suppressed because one or more lines are too long

View File

@ -1 +1,77 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "ocr_test",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 14853448746796404529,
"filename": "ocr_test.pdf",
"uri": null
},
"furniture": {
"self_ref": "#/furniture",
"parent": null,
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"parent": null,
"children": [
{
"cref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"cref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 69.0,
"t": 767.2550252278646,
"r": 506.6666666666667,
"b": 688.5883585611979,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
94
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
"formatting": null,
"hyperlink": null
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 595.201171875,
"height": 841.9216918945312
},
"image": null,
"page_no": 1
}
}
}

File diff suppressed because one or more lines are too long

View File

@ -293,6 +293,7 @@ def verify_conversion_result_v1(
generate: bool = False,
ocr_engine: Optional[str] = None,
fuzzy: bool = False,
indent: int = 2,
):
PageList = TypeAdapter(List[Page])
@ -323,11 +324,13 @@ def verify_conversion_result_v1(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
@ -377,6 +380,7 @@ def verify_conversion_result_v2(
generate: bool = False,
ocr_engine: Optional[str] = None,
fuzzy: bool = False,
indent: int = 2,
):
PageList = TypeAdapter(List[Page])
@ -405,11 +409,13 @@ def verify_conversion_result_v2(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw: