Merge branch 'main' into nli/layoutmodel_improvements

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-07-25 03:24:59 +00:00 · 2025-05-09 14:47:44 +02:00 · 2025-05-09 14:47:44 +02:00 · 6e956dc551
commit 6e956dc551
parent a553a1e5bf 7c705739f9
72 changed files with 2374963 additions and 551 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,26 @@
+## [v2.31.0](https://github.com/docling-project/docling/releases/tag/v2.31.0) - 2025-04-25
+
+### Feature
+
+* Add tutorial using Milvus and Docling for RAG pipeline ([#1449](https://github.com/docling-project/docling/issues/1449)) ([`a2fbbba`](https://github.com/docling-project/docling/commit/a2fbbba9f7f889a1f84f8642cf5c75feb57e8668))
+
+### Fix
+
+* **html:** Handle address, details, and summary tags ([#1436](https://github.com/docling-project/docling/issues/1436)) ([`ed20124`](https://github.com/docling-project/docling/commit/ed20124544a1b10f068b11bbdf12e1bfc7567195))
+* Treat overflowing -v flags as DEBUG ([#1419](https://github.com/docling-project/docling/issues/1419)) ([`8012a3e`](https://github.com/docling-project/docling/commit/8012a3e4d6b9ce4cae28210d525d87175da2f5c2))
+* **codecov:** Fix codecov argument and yaml file ([#1399](https://github.com/docling-project/docling/issues/1399)) ([`fa7fc9e`](https://github.com/docling-project/docling/commit/fa7fc9e63d45f44af57dd6ad7636a2a16f04b8c4))
+
+### Documentation
+
+* Fix wrong output format in example code ([#1427](https://github.com/docling-project/docling/issues/1427)) ([`c2470ed`](https://github.com/docling-project/docling/commit/c2470ed216eaf3aae0ad16306de19682fa55b99b))
+* Add OpenSSF Best Practices badge ([#1430](https://github.com/docling-project/docling/issues/1430)) ([`64918a8`](https://github.com/docling-project/docling/commit/64918a81ac315ea0108f1411a1537dd12117e49c))
+* Typo fixes in docling_document.md ([#1400](https://github.com/docling-project/docling/issues/1400)) ([`995b3b0`](https://github.com/docling-project/docling/commit/995b3b0ab1c4e566eaba2ea31af3db21eb12a7ae))
+* Updated the [Usage] link in architecture.md ([#1416](https://github.com/docling-project/docling/issues/1416)) ([`88948b0`](https://github.com/docling-project/docling/commit/88948b0bbaba2ecbaa71f703d2cc94055a3e6b3e))
+* **ocr:** Add docs entry for OnnxTR OCR plugin ([#1382](https://github.com/docling-project/docling/issues/1382)) ([`a7dd59c`](https://github.com/docling-project/docling/commit/a7dd59c5cb3e7f1eba76c7e2e20be79d8fa5b367))
+* **security:** More statements about secure development ([#1381](https://github.com/docling-project/docling/issues/1381)) ([`293c28c`](https://github.com/docling-project/docling/commit/293c28ca7c4a44dcd56595ed2fe0372fe1b531b2))
+* Add testing in the docs ([#1379](https://github.com/docling-project/docling/issues/1379)) ([`01fbfd5`](https://github.com/docling-project/docling/commit/01fbfd565204258acb2986dcaefad3a328626c66))
+* Add Notes for Installing in Intel macOS ([#1377](https://github.com/docling-project/docling/issues/1377)) ([`a026b4e`](https://github.com/docling-project/docling/commit/a026b4e84bcc8e11ceaa6d9a46c7c741000aff44))
+
 ## [v2.30.0](https://github.com/docling-project/docling/releases/tag/v2.30.0) - 2025-04-14

 ### Feature
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                        )
                    return _txt

-                # restore original HTML by removing previouly added markers
+                # restore original HTML by removing previously added markers
                for regex in [
                    rf"<pre>\s*<code>\s*{_START_MARKER}",
                    rf"{_STOP_MARKER}\s*</code>\s*</pre>",
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

        # Common styles for bullet and numbered lists.
        # "List Bullet", "List Number", "List Paragraph"
-        # Identify wether list is a numbered list or not
+        # Identify whether list is a numbered list or not
        # is_numbered = "List Bullet" not in paragraph.style.name
        is_numbered = False
        p_style_id, p_level = self._get_label_and_level(paragraph)
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
        super().__init__(in_doc, path_or_stream)
        self.path_or_stream = path_or_stream

-        # Initialize the root of the document hiearchy
+        # Initialize the root of the document hierarchy
        self.root: Optional[NodeItem] = None

        self.valid = False
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@ -1,6 +1,6 @@
 """Backend to parse patents from the United States Patent Office (USPTO).

-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
 patent applications since 2001.
 The original files can be found in https://bulkdata.uspto.gov.
 """
@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
                    )

            elif name == self.Element.PARAGRAPH.value and text:
-                # remmove blank spaces added in paragraphs
+                # remove blank spaces added in paragraphs
                text = re.sub("\\s+", " ", text)
                if self.Element.ABSTRACT.value in self.property:
                    self.abstract = (
@ -1697,7 +1697,7 @@ class XmlTable:
 class HtmlEntity:
    """Provide utility functions to get the HTML entities of styled characters.

-    This class has been developped from:
+    This class has been developed from:
    https://unicode-table.com/en/html-entities/
    https://www.w3.org/TR/WD-math-970515/table03.html
    """
@ -1896,7 +1896,7 @@ class HtmlEntity:
        """Get an HTML entity of a greek letter in ISO 8879.

        Args:
-            The text to transform, as an ISO 8879 entitiy.
+            The text to transform, as an ISO 8879 entity.

        Returns:
            The HTML entity representing a greek letter. If the input text is not
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -521,7 +521,7 @@ def convert(  # noqa: C901
            if image_export_mode != ImageRefMode.PLACEHOLDER:
                pipeline_options.generate_page_images = True
                pipeline_options.generate_picture_images = (
-                    True  # FIXME: to be deprecated in verson 3
+                    True  # FIXME: to be deprecated in version 3
                )
                pipeline_options.images_scale = 2

--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -189,7 +189,9 @@ class DocumentConverter:
    def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
        """Generate a hash of pipeline options to use as part of the cache key."""
        options_str = str(pipeline_options.model_dump())
-        return hashlib.md5(options_str.encode("utf-8")).hexdigest()
+        return hashlib.md5(
+            options_str.encode("utf-8"), usedforsecurity=False
+        ).hexdigest()

    def initialize_pipeline(self, format: InputFormat):
        """Initialize the conversion pipeline for the selected format."""
--- a/docling/models/picture_description_vlm_model.py
+++ b/docling/models/picture_description_vlm_model.py
@ -57,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                artifacts_path,
                torch_dtype=torch.bfloat16,
                _attn_implementation=(
-                    "flash_attention_2" if self.device.startswith("cuda") else "eager"
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
                ),
            ).to(self.device)

--- a/docling/models/readingorder_model.py
+++ b/docling/models/readingorder_model.py
@ -346,7 +346,7 @@ class ReadingOrderModel:
        new_item.prov.append(prov)

    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
-        with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
+        with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
            page_elements = self._assembled_to_readingorder_elements(conv_res)

            # Apply reading order
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
                                tcells = table_cluster.cells
                            tokens = []
                            for c in tcells:
-                                # Only allow non empty stings (spaces) into the cells of a table
+                                # Only allow non empty strings (spaces) into the cells of a table
                                if len(c.text.strip()) > 0:
                                    new_cell = copy.deepcopy(c)
                                    new_cell.rect = BoundingRectangle.from_bounding_box(
@ -267,7 +267,7 @@ class TableStructureModel(BasePageModel):
                                    element["bbox"]["token"] = text_piece

                                tc = TableCell.model_validate(element)
-                                if self.do_cell_matching and tc.bbox is not None:
+                                if tc.bbox is not None:
                                    tc.bbox = tc.bbox.scaled(1 / self.scale)
                                table_cells.append(tc)

--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import logging
 from collections.abc import Iterable
 from pathlib import Path
@ -38,6 +40,8 @@ class TesseractOcrModel(BaseOcrModel):
        self.options: TesseractOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

        if self.enabled:
            install_errmsg = (
@ -84,9 +88,7 @@ class TesseractOcrModel(BaseOcrModel):
                "oem": tesserocr.OEM.DEFAULT,
            }

-            self.reader = None
            self.osd_reader = None
-            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

            if self.options.path is not None:
                tesserocr_kwargs["path"] = self.options.path
@ -151,7 +153,7 @@ class TesseractOcrModel(BaseOcrModel):
                            script = map_tesseract_script(script)
                            lang = f"{self.script_prefix}{script}"

-                            # Check if the detected languge is present in the system
+                            # Check if the detected language is present in the system
                            if lang not in self._tesserocr_languages:
                                msg = f"Tesseract detected the script '{script}' and language '{lang}'."
                                msg += " However this language is not installed in your system and will be ignored."
--- a/docling/utils/utils.py
+++ b/docling/utils/utils.py
@ -20,7 +20,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
    """Create a stable page_hash of the path_or_stream of a file"""

    block_size = 65536
-    hasher = hashlib.sha256()
+    hasher = hashlib.sha256(usedforsecurity=False)

    def _hash_buf(binary_stream):
        buf = binary_stream.read(block_size)  # read and page_hash in chunks
@ -38,7 +38,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:


 def create_hash(string: str):
-    hasher = hashlib.sha256()
+    hasher = hashlib.sha256(usedforsecurity=False)
    hasher.update(string.encode("utf-8"))

    return hasher.hexdigest()
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@ -569,7 +569,7 @@
    "The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
    "In this notebook, we will leverage:\n",
    "- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
-    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
+    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
    "- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
    "\n",
    "Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@ -206,7 +206,7 @@
   "source": [
    "Points to notice looking at the output chunks below:\n",
    "- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
-    "- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
+    "- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
    "- Where possible, we merge undersized peer chunks (see chunk 0)\n",
    "- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)"
   ]
--- a/docs/examples/pictures_description.ipynb
+++ b/docs/examples/pictures_description.ipynb
@ -279,7 +279,7 @@
    "## Use other vision models\n",
    "\n",
    "The examples above can also be reproduced using other vision model.\n",
-    "The Docling options `PictureDescriptionVlmOptions` allows to speficy your favorite vision model from the Hugging Face Hub."
+    "The Docling options class `PictureDescriptionVlmOptions` allows you to specify your favorite vision model from the Hugging Face Hub."
   ]
  },
  {
--- a/docs/examples/rag_milvus.ipynb
+++ b/docs/examples/rag_milvus.ipynb
@ -0,0 +1,551 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/docling-project/docling/blob/main/docs/examples/rag_milvus.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RAG with Milvus\n",
+    "\n",
+    "| Step | Tech | Execution |\n",
+    "| --- | --- | --- |\n",
+    "| Embedding | OpenAI (text-embedding-3-small) | 🌐 Remote |\n",
+    "| Vector store | Milvus | 💻 Local |\n",
+    "| Gen AI | OpenAI (gpt-4o) | 🌐 Remote |\n",
+    "\n",
+    "\n",
+    "## A recipe 🧑‍🍳 🐥 💚\n",
+    "\n",
+    "This is a code recipe that uses [Milvus](https://milvus.io/), the world's most advanced open-source vector database, to perform RAG over documents parsed by [Docling](https://docling-project.github.io/docling/).\n",
+    "\n",
+    "In this notebook, we accomplish the following:\n",
+    "* Parse documents using Docling's document conversion capabilities\n",
+    "* Perform hierarchical chunking of the documents using Docling\n",
+    "* Generate text embeddings with OpenAI\n",
+    "* Perform RAG using Milvus, the world's most advanced open-source vector database\n",
+    "\n",
+    "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
+    "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preparation\n",
+    "\n",
+    "### Dependencies and Environment\n",
+    "\n",
+    "To start, install the required dependencies by running the following command:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install --upgrade pymilvus docling openai torch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> If you are using Google Colab, to enable dependencies just installed, you may need to **restart the runtime** (click on the \"Runtime\" menu at the top of the screen, and select \"Restart session\" from the dropdown menu)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### GPU Checking"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Part of what makes Docling so remarkable is the fact that it can run on commodity hardware. This means that this notebook can be run on a local machine with GPU acceleration. If you're using a MacBook with an Apple Silicon chip, Docling integrates seamlessly with Metal Performance Shaders (MPS). MPS provides out-of-the-box GPU acceleration for macOS, seamlessly integrating with PyTorch and TensorFlow, offering energy-efficient performance on Apple Silicon, and broad compatibility with all Metal-supported GPUs.\n",
+    "\n",
+    "The code below checks to see if a GPU is available, either via CUDA or MPS."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MPS GPU is enabled.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "\n",
+    "# Check if GPU or MPS is available\n",
+    "if torch.cuda.is_available():\n",
+    "    device = torch.device(\"cuda\")\n",
+    "    print(f\"CUDA GPU is enabled: {torch.cuda.get_device_name(0)}\")\n",
+    "elif torch.backends.mps.is_available():\n",
+    "    device = torch.device(\"mps\")\n",
+    "    print(\"MPS GPU is enabled.\")\n",
+    "else:\n",
+    "    raise OSError(\n",
+    "        \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setting Up API Keys\n",
+    "\n",
+    "We will use OpenAI as the LLM in this example. You should prepare the [OPENAI_API_KEY](https://platform.openai.com/docs/quickstart) as an environment variable."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-***********\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prepare the LLM and Embedding Model\n",
+    "\n",
+    "We initialize the OpenAI client to prepare the embedding model.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "\n",
+    "openai_client = OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Define a function to generate text embeddings using the OpenAI client. We use the [text-embedding-3-small](https://platform.openai.com/docs/guides/embeddings) model as an example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def emb_text(text):\n",
+    "    return (\n",
+    "        openai_client.embeddings.create(input=text, model=\"text-embedding-3-small\")\n",
+    "        .data[0]\n",
+    "        .embedding\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generate a test embedding and print its dimension and first few elements."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1536\n",
+      "[0.009889289736747742, -0.005578675772994757, 0.00683477520942688, -0.03805781528353691, -0.01824733428657055, -0.04121600463986397, -0.007636285852640867, 0.03225184231996536, 0.018949154764413834, 9.352207416668534e-05]\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_embedding = emb_text(\"This is a test\")\n",
+    "embedding_dim = len(test_embedding)\n",
+    "print(embedding_dim)\n",
+    "print(test_embedding[:10])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Process Data Using Docling\n",
+    "\n",
+    "Docling can parse various document formats into a unified representation (Docling Document), which can then be exported to different output formats. For a full list of supported input and output formats, please refer to [the official documentation](https://docling-project.github.io/docling/usage/supported_formats/).\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this tutorial, we will use a Markdown file ([source](https://milvus.io/docs/overview.md)) as the input. We will process the document using a **HierarchicalChunker** provided by Docling to generate structured, hierarchical chunks suitable for downstream RAG tasks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docling_core.transforms.chunker import HierarchicalChunker\n",
+    "\n",
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "converter = DocumentConverter()\n",
+    "chunker = HierarchicalChunker()\n",
+    "\n",
+    "# Convert the input file to Docling Document\n",
+    "source = \"https://milvus.io/docs/overview.md\"\n",
+    "doc = converter.convert(source).document\n",
+    "\n",
+    "# Perform hierarchical chunking\n",
+    "texts = [chunk.text for chunk in chunker.chunk(doc)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Data into Milvus\n",
+    "\n",
+    "### Create the collection\n",
+    "\n",
+    "With data in hand, we can create a `MilvusClient` instance and insert the data into a Milvus collection. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pymilvus import MilvusClient\n",
+    "\n",
+    "milvus_client = MilvusClient(uri=\"./milvus_demo.db\")\n",
+    "collection_name = \"my_rag_collection\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> As for the argument of `MilvusClient`:\n",
+    "> - Setting the `uri` as a local file, e.g. `./milvus.db`, is the most convenient method, as it automatically utilizes [Milvus Lite](https://milvus.io/docs/milvus_lite.md) to store all data in this file.\n",
+    "> - If you have a large amount of data, you can set up a more performant Milvus server on [Docker or Kubernetes](https://milvus.io/docs/quickstart.md). In this setup, please use the server URI, e.g. `http://localhost:19530`, as your `uri`.\n",
+    "> - If you want to use [Zilliz Cloud](https://zilliz.com/cloud), the fully managed cloud service for Milvus, adjust the `uri` and `token`, which correspond to the [Public Endpoint and API key](https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details) in Zilliz Cloud."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check if the collection already exists and drop it if it does."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if milvus_client.has_collection(collection_name):\n",
+    "    milvus_client.drop_collection(collection_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a new collection with specified parameters.\n",
+    "\n",
+    "If we don’t specify any field information, Milvus will automatically create a default `id` field for primary key, and a `vector` field to store the vector data. A reserved JSON field is used to store non-schema-defined fields and their values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "milvus_client.create_collection(\n",
+    "    collection_name=collection_name,\n",
+    "    dimension=embedding_dim,\n",
+    "    metric_type=\"IP\",  # Inner product distance\n",
+    "    consistency_level=\"Strong\",  # Supported values are (`\"Strong\"`, `\"Session\"`, `\"Bounded\"`, `\"Eventually\"`). See https://milvus.io/docs/consistency.md#Consistency-Level for more details.\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Insert data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing chunks: 100%|██████████| 38/38 [00:14<00:00,  2.59it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'insert_count': 38, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37], 'cost': 0}"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from tqdm import tqdm\n",
+    "\n",
+    "data = []\n",
+    "\n",
+    "for i, chunk in enumerate(tqdm(texts, desc=\"Processing chunks\")):\n",
+    "    embedding = emb_text(chunk)\n",
+    "    data.append({\"id\": i, \"vector\": embedding, \"text\": chunk})\n",
+    "\n",
+    "milvus_client.insert(collection_name=collection_name, data=data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Build RAG"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Retrieve data for a query\n",
+    "\n",
+    "Let’s specify a query question about the website we just scraped."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question = (\n",
+    "    \"What are the three deployment modes of Milvus, and what are their differences?\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Search for the question in the collection and retrieve the semantic top-3 matches."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_res = milvus_client.search(\n",
+    "    collection_name=collection_name,\n",
+    "    data=[emb_text(question)],\n",
+    "    limit=3,\n",
+    "    search_params={\"metric_type\": \"IP\", \"params\": {}},\n",
+    "    output_fields=[\"text\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let’s take a look at the search results of the query.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[\n",
+      "    [\n",
+      "        \"Milvus offers three deployment modes, covering a wide range of data scales\\u2014from local prototyping in Jupyter Notebooks to massive Kubernetes clusters managing tens of billions of vectors:\",\n",
+      "        0.6503315567970276\n",
+      "    ],\n",
+      "    [\n",
+      "        \"Milvus Lite is a Python library that can be easily integrated into your applications. As a lightweight version of Milvus, it\\u2019s ideal for quick prototyping in Jupyter Notebooks or running on edge devices with limited resources. Learn more.\\nMilvus Standalone is a single-machine server deployment, with all components bundled into a single Docker image for convenient deployment. Learn more.\\nMilvus Distributed can be deployed on Kubernetes clusters, featuring a cloud-native architecture designed for billion-scale or even larger scenarios. This architecture ensures redundancy in critical components. Learn more.\",\n",
+      "        0.6281915903091431\n",
+      "    ],\n",
+      "    [\n",
+      "        \"What is Milvus?\\nUnstructured Data, Embeddings, and Milvus\\nWhat Makes Milvus so Fast\\uff1f\\nWhat Makes Milvus so Scalable\\nTypes of Searches Supported by Milvus\\nComprehensive Feature Set\",\n",
+      "        0.6117826700210571\n",
+      "    ]\n",
+      "]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "\n",
+    "retrieved_lines_with_distances = [\n",
+    "    (res[\"entity\"][\"text\"], res[\"distance\"]) for res in search_res[0]\n",
+    "]\n",
+    "print(json.dumps(retrieved_lines_with_distances, indent=4))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Use LLM to get a RAG response\n",
+    "\n",
+    "Convert the retrieved documents into a string format.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "context = \"\\n\".join(\n",
+    "    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Define system and user prompts for the Language Model. This prompt is assembled with the retrieved documents from Milvus.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SYSTEM_PROMPT = \"\"\"\n",
+    "Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.\n",
+    "\"\"\"\n",
+    "USER_PROMPT = f\"\"\"\n",
+    "Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.\n",
+    "<context>\n",
+    "{context}\n",
+    "</context>\n",
+    "<question>\n",
+    "{question}\n",
+    "</question>\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use OpenAI ChatGPT to generate a response based on the prompts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The three deployment modes of Milvus are:\n",
+      "\n",
+      "1. **Milvus Lite**: This is a Python library that integrates easily into your applications. It's a lightweight version ideal for quick prototyping in Jupyter Notebooks or for running on edge devices with limited resources.\n",
+      "\n",
+      "2. **Milvus Standalone**: This mode is a single-machine server deployment where all components are bundled into a single Docker image, making it convenient to deploy.\n",
+      "\n",
+      "3. **Milvus Distributed**: This mode is designed for deployment on Kubernetes clusters. It features a cloud-native architecture suited for managing scenarios at a billion-scale or larger, ensuring redundancy in critical components.\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = openai_client.chat.completions.create(\n",
+    "    model=\"gpt-4o\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
+    "        {\"role\": \"user\", \"content\": USER_PROMPT},\n",
+    "    ],\n",
+    ")\n",
+    "print(response.choices[0].message.content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/docs/examples/rag_weaviate.ipynb
+++ b/docs/examples/rag_weaviate.ipynb
@ -43,7 +43,7 @@
    "\n",
    "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
    "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU."
   ]
  },
  {
@ -716,7 +716,7 @@
    "id": "7tGz49nfUegG"
   },
   "source": [
-    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documetation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
+    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
   ]
  }
 ],
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@ -137,7 +137,7 @@ This is a collection of FAQ collected from the user questions on <https://github
    ### Some images are missing from MS Word and Powerpoint

    The image processing library used by Docling is able to handle embedded WMF images only on Windows platform.
-    If you are on other operaring systems, these images will be ignored.
+    If you are on other operating systems, these images will be ignored.


 ??? question "`HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'"
--- a/docs/usage/index.md
+++ b/docs/usage/index.md
@ -200,7 +200,7 @@ You can limit the CPU threads used by Docling by setting the environment variabl
    using a `DocumentConverter` (high-level API) as discussed in the sections above
    should suffice — and is the recommended way.

-By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](../supported_formats.md)).
+By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](supported_formats.md)).
 You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](../examples/run_with_formats.py) example.
 Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:

--- a/docs/v2.md
+++ b/docs/v2.md
@ -37,7 +37,7 @@ docling ./input/dir --output ./scratch --abort-on-error

 ### Setting up a `DocumentConverter`

-To accomodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
+To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
 You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options
 per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults
 will be used for all `allowed_formats`.
@ -151,7 +151,7 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
 ## Inspect the converted document:
 conv_result.document.print_element_tree()

-## Iterate the elements in reading order, including hierachy level:
+## Iterate the elements in reading order, including hierarchy level:
 for item, level in conv_result.document.iterate_items():
    if isinstance(item, TextItem):
        print(item.text)
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -101,6 +101,7 @@ nav:
      - "Figure enrichment": examples/develop_picture_enrichment.py
      - "Formula enrichment": examples/develop_formula_understanding.py
    - 🗂️ More examples:
+      - examples/rag_milvus.ipynb
      - examples/rag_weaviate.ipynb
      - RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb
      - examples/rag_azuresearch.ipynb
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.30.0"  # DO NOT EDIT, updated automatically
+version = "2.31.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
  "Christoph Auer <cau@zurich.ibm.com>",
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v1/amt_handbook_sample.json
+++ b/tests/data/groundtruth/docling_v1/amt_handbook_sample.json
--- a/tests/data/groundtruth/docling_v1/amt_handbook_sample.pages.json
+++ b/tests/data/groundtruth/docling_v1/amt_handbook_sample.pages.json
--- a/tests/data/groundtruth/docling_v1/code_and_formula.json
+++ b/tests/data/groundtruth/docling_v1/code_and_formula.json
--- a/tests/data/groundtruth/docling_v1/code_and_formula.pages.json
+++ b/tests/data/groundtruth/docling_v1/code_and_formula.pages.json
--- a/tests/data/groundtruth/docling_v1/picture_classification.json
+++ b/tests/data/groundtruth/docling_v1/picture_classification.json
--- a/tests/data/groundtruth/docling_v1/picture_classification.pages.json
+++ b/tests/data/groundtruth/docling_v1/picture_classification.pages.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_01.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_01.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_01.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_01.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_02.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_02.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_02.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_02.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_03.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_03.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_03.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_03.pages.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v2/amt_handbook_sample.json
+++ b/tests/data/groundtruth/docling_v2/amt_handbook_sample.json
--- a/tests/data/groundtruth/docling_v2/amt_handbook_sample.pages.json
+++ b/tests/data/groundtruth/docling_v2/amt_handbook_sample.pages.json
--- a/tests/data/groundtruth/docling_v2/code_and_formula.json
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.json
--- a/tests/data/groundtruth/docling_v2/code_and_formula.pages.json
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.pages.json
--- a/tests/data/groundtruth/docling_v2/picture_classification.json
+++ b/tests/data/groundtruth/docling_v2/picture_classification.json
--- a/tests/data/groundtruth/docling_v2/picture_classification.pages.json
+++ b/tests/data/groundtruth/docling_v2/picture_classification.pages.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_02.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_02.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_02.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_02.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_03.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_03.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_03.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_03.pages.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.json
@ -1 +1,83 @@
-{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.5883585611979, 506.6666666666667, 767.2550252278646], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
+{
+  "_name": "",
+  "type": "pdf-document",
+  "description": {
+    "title": null,
+    "abstract": null,
+    "authors": null,
+    "affiliations": null,
+    "subjects": null,
+    "keywords": null,
+    "publication_date": null,
+    "languages": null,
+    "license": null,
+    "publishers": null,
+    "url_refs": null,
+    "references": null,
+    "publication": null,
+    "reference_count": null,
+    "citation_count": null,
+    "citation_date": null,
+    "advanced": null,
+    "analytics": null,
+    "logs": [],
+    "collection": null,
+    "acquisition": null
+  },
+  "file-info": {
+    "filename": "ocr_test.pdf",
+    "filename-prov": null,
+    "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61",
+    "#-pages": 1,
+    "collection-name": null,
+    "description": null,
+    "page-hashes": [
+      {
+        "hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3",
+        "model": "default",
+        "page": 1
+      }
+    ]
+  },
+  "main-text": [
+    {
+      "prov": [
+        {
+          "bbox": [
+            69.0,
+            688.5883585611979,
+            506.6666666666667,
+            767.2550252278646
+          ],
+          "page": 1,
+          "span": [
+            0,
+            94
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
+      "type": "paragraph",
+      "payload": null,
+      "name": "Text",
+      "font": null
+    }
+  ],
+  "figures": [],
+  "tables": [],
+  "bitmaps": null,
+  "equations": [],
+  "footnotes": [],
+  "page-dimensions": [
+    {
+      "height": 841.9216918945312,
+      "page": 1,
+      "width": 595.201171875
+    }
+  ],
+  "page-footers": [],
+  "page-headers": [],
+  "_s3_data": null,
+  "identifiers": null
+}
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
@ -1 +1,77 @@
-{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.3.0",
+  "name": "ocr_test",
+  "origin": {
+    "mimetype": "application/pdf",
+    "binary_hash": 14853448746796404529,
+    "filename": "ocr_test.pdf",
+    "uri": null
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "parent": null,
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "parent": null,
+    "children": [
+      {
+        "cref": "#/texts/0"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "cref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "page_no": 1,
+          "bbox": {
+            "l": 69.0,
+            "t": 767.2550252278646,
+            "r": 506.6666666666667,
+            "b": 688.5883585611979,
+            "coord_origin": "BOTTOMLEFT"
+          },
+          "charspan": [
+            0,
+            94
+          ]
+        }
+      ],
+      "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
+      "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
+      "formatting": null,
+      "hyperlink": null
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {
+    "1": {
+      "size": {
+        "width": 595.201171875,
+        "height": 841.9216918945312
+      },
+      "image": null,
+      "page_no": 1
+    }
+  }
+}
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@ -293,6 +293,7 @@ def verify_conversion_result_v1(
    generate: bool = False,
    ocr_engine: Optional[str] = None,
    fuzzy: bool = False,
+    indent: int = 2,
 ):
    PageList = TypeAdapter(List[Page])

@ -323,11 +324,13 @@ def verify_conversion_result_v1(
    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)
        with open(pages_path, "w") as fw:
-            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
+            fw.write(
+                json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
+            )

        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w") as fw:
-            fw.write(json.dumps(doc_pred, default=pydantic_encoder))
+            fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))

        md_path.parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w") as fw:
@ -377,6 +380,7 @@ def verify_conversion_result_v2(
    generate: bool = False,
    ocr_engine: Optional[str] = None,
    fuzzy: bool = False,
+    indent: int = 2,
 ):
    PageList = TypeAdapter(List[Page])

@ -405,11 +409,13 @@ def verify_conversion_result_v2(
    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)
        with open(pages_path, "w") as fw:
-            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
+            fw.write(
+                json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
+            )

        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w") as fw:
-            fw.write(json.dumps(doc_pred, default=pydantic_encoder))
+            fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))

        md_path.parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w") as fw: