Updates for DoclingParseV3DocumentBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-03-10 17:11:20 +01:00
parent 3f0e98b1ad 4d64c4c0b6
commit 099aa4da83
288 changed files with 33999 additions and 36514 deletions
--- a/docs/examples/backend_csv.ipynb
+++ b/docs/examples/backend_csv.ipynb
@@ -0,0 +1,80 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Conversion of CSV files\n",
+    "\n",
+    "This example shows how to convert CSV files to a structured Docling Document.\n",
+    "\n",
+    "* Multiple delimiters are supported: `,` `;` `|` `[tab]`\n",
+    "* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example Code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "# Convert CSV to Docling document\n",
+    "converter = DocumentConverter()\n",
+    "result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n",
+    "output = result.document.export_to_markdown()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This code generates the following output:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "|   Index | Customer Id     | First Name   | Last Name   | Company                         | City              | Country                    | Phone 1                | Phone 2               | Email                       | Subscription Date   | Website                     |\n",
+    "|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n",
+    "|       1 | DD37Cf93aecA6Dc | Sheryl       | Baxter      | Rasmussen Group                 | East Leonard      | Chile                      | 229.077.5154           | 397.884.0519x718      | zunigavanessa@smith.info    | 2020-08-24          | http://www.stephenson.com/  |\n",
+    "|       2 | 1Ef7b82A4CAAD10 | Preston      | Lozano, Dr  | Vega-Gentry                     | East Jimmychester | Djibouti                   | 5153435776             | 686-620-1820x944      | vmata@colon.com             | 2021-04-23          | http://www.hobbs.com/       |\n",
+    "|       3 | 6F94879bDAfE5a6 | Roy          | Berry       | Murillo-Perry                   | Isabelborough     | Antigua and Barbuda        | +1-539-402-0259        | (496)978-3969x58947   | beckycarr@hogan.com         | 2020-03-25          | http://www.lawrence.com/    |\n",
+    "|       4 | 5Cef8BFA16c5e3c | Linda        | Olsen       | Dominguez, Mcmillan and Donovan | Bensonview        | Dominican Republic         | 001-808-617-6467x12895 | +1-813-324-8756       | stanleyblackwell@benson.org | 2020-06-02          | http://www.good-lyons.com/  |\n",
+    "|       5 | 053d585Ab6b3159 | Joanna       | Bender      | Martin, Lang and Andrade        | West Priscilla    | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net     | 2021-04-17          | https://goodwin-ingram.com/ |"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "docling-TtEIaPrw-py3.12",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@@ -82,7 +82,7 @@
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "# a sample PMC article:\n",
-    "source = \"../../tests/data/pubmed/elife-56337.nxml\"\n",
+    "source = \"../../tests/data/jats/elife-56337.nxml\"\n",
    "converter = DocumentConverter()\n",
    "result = converter.convert(source)\n",
    "print(result.status)"
@@ -97,7 +97,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@@ -106,11 +106,11 @@
     "text": [
      "# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
      "\n",
-      "Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n",
+      "Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
+      "\n",
+      "The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
      "\n",
      "## Abstract\n",
-      "\n",
-      "The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
      "\n"
     ]
    }
@@ -131,7 +131,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@@ -198,7 +198,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@@ -224,7 +224,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -261,7 +261,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -313,7 +313,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@@ -359,9 +359,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
+      "Parsing zip file, splitting into XML sections, and exporting to files...\n"
+     ]
+    }
+   ],
   "source": [
    "import zipfile\n",
    "\n",
@@ -407,7 +416,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@@ -435,7 +444,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@@ -449,7 +458,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3964d1ff30f74588a2f6b53ca8865a9f",
+       "model_id": "316241ca89a843bda3170f2a5c76c639",
       "version_major": 2,
       "version_minor": 0
      },
@@ -471,7 +480,7 @@
   "source": [
    "from tqdm.notebook import tqdm\n",
    "\n",
-    "from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n",
+    "from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
    "from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
    "from docling.datamodel.base_models import InputFormat\n",
    "from docling.datamodel.document import InputDocument\n",
@@ -479,10 +488,10 @@
    "# check PMC\n",
    "in_doc = InputDocument(\n",
    "    path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
-    "    format=InputFormat.XML_PUBMED,\n",
-    "    backend=PubMedDocumentBackend,\n",
+    "    format=InputFormat.XML_JATS,\n",
+    "    backend=JatsDocumentBackend,\n",
    ")\n",
-    "backend = PubMedDocumentBackend(\n",
+    "backend = JatsDocumentBackend(\n",
    "    in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
    ")\n",
    "print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
@@ -521,7 +530,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@@ -543,7 +552,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
+    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
   ]
  },
  {
@@ -579,7 +588,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -607,7 +616,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -625,144 +634,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
-      "Loading files:  51%|█████     | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e9208639f1a4418d97267a28305d18fa",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Parsing nodes:   0%|          | 0/99 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "88026613f6f44f0c8476dceaa1cb78cd",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7522b8b434b54616b4cfc3d71e9556d7",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5879d8161c2041f5b100959e69ff9017",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "557912b5e3c741f3a06127156bc46379",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "843bb145942b449aa55fc5b8208da734",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c7dba09a4aed422998e9b9c2c3a70317",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/425 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from llama_index.core import StorageContext, VectorStoreIndex\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@@ -5,17 +5,19 @@ from pathlib import Path
 from typing import Iterable

 import yaml
+from docling_core.types.doc import ImageRefMode

 from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

 _log = logging.getLogger(__name__)

 USE_V2 = True
-USE_LEGACY = True
+USE_LEGACY = False


 def export_documents(
@@ -34,9 +36,26 @@ def export_documents(
            doc_filename = conv_res.input.file.stem

            if USE_V2:
-                # Export Docling document format to JSON:
-                with (output_dir / f"{doc_filename}.json").open("w") as fp:
-                    fp.write(json.dumps(conv_res.document.export_to_dict()))
+                conv_res.document.save_as_json(
+                    output_dir / f"{doc_filename}.json",
+                    image_mode=ImageRefMode.PLACEHOLDER,
+                )
+                conv_res.document.save_as_html(
+                    output_dir / f"{doc_filename}.html",
+                    image_mode=ImageRefMode.EMBEDDED,
+                )
+                conv_res.document.save_as_document_tokens(
+                    output_dir / f"{doc_filename}.doctags.txt"
+                )
+                conv_res.document.save_as_markdown(
+                    output_dir / f"{doc_filename}.md",
+                    image_mode=ImageRefMode.PLACEHOLDER,
+                )
+                conv_res.document.save_as_markdown(
+                    output_dir / f"{doc_filename}.txt",
+                    image_mode=ImageRefMode.PLACEHOLDER,
+                    strict_text=True,
+                )

                # Export Docling document format to YAML:
                with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
@@ -104,11 +123,10 @@ def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
-        Path("tests/data/redp5110_sampled.pdf"),
-        # Path("./tests/data/2206.01062.pdf"),
-        # Path("./tests/data/2203.01017v2.pdf"),
-        # Path("./tests/data/2305.03393v1.pdf"),
-        # Path("./tests/data/redp5110_sampled.pdf"),
+        Path("./tests/data/pdf/2206.01062.pdf"),
+        Path("./tests/data/pdf/2203.01017v2.pdf"),
+        Path("./tests/data/pdf/2305.03393v1.pdf"),
+        Path("./tests/data/pdf/redp5110_sampled.pdf"),
    ]

    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
@@ -121,9 +139,14 @@ def main():
    # settings.debug.visualize_tables = True
    # settings.debug.visualize_cells = True

+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.generate_page_images = True
+
    doc_converter = DocumentConverter(
        format_options={
-            InputFormat.PDF: PdfFormatOption(backend=DoclingParseV3DocumentBackend)
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options, backend=DoclingParseV3DocumentBackend
+            )
        }
    )

--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")

    ###########################################################################

--- a/docs/examples/develop_formula_understanding.py
+++ b/docs/examples/develop_formula_understanding.py
@@ -1,3 +1,7 @@
+# WARNING
+# This example demonstrates only how to develop a new enrichment model.
+# It does not run the actual formula understanding model.
+
 import logging
 from pathlib import Path
 from typing import Iterable
@@ -68,7 +72,7 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/2203.01017v2.pdf")
+    input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")

    pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
    pipeline_options.do_formula_understanding = True
--- a/docs/examples/develop_picture_enrichment.py
+++ b/docs/examples/develop_picture_enrichment.py
@@ -1,3 +1,7 @@
+# WARNING
+# This example demonstrates only how to develop a new enrichment model.
+# It does not run the actual picture classifier model.
+
 import logging
 from pathlib import Path
 from typing import Any, Iterable
@@ -71,7 +75,7 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")

    pipeline_options = ExamplePictureClassifierPipelineOptions()
    pipeline_options.images_scale = 2.0
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@@ -16,7 +16,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@@ -19,7 +19,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
--- a/docs/examples/export_tables.py
+++ b/docs/examples/export_tables.py
@@ -12,7 +12,7 @@ _log = logging.getLogger(__name__)
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()
--- a/docs/examples/full_page_ocr.py
+++ b/docs/examples/full_page_ocr.py
@@ -14,7 +14,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/2206.01062.pdf")
+    input_doc = Path("./tests/data/pdf/2206.01062.pdf")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -83,7 +83,15 @@
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors\n"
+     ]
+    }
+   ],
   "source": [
    "from docling.chunking import HybridChunker\n",
    "\n",
@@ -91,6 +99,13 @@
    "chunk_iter = chunker.chunk(dl_doc=doc)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> 👉 **NOTE**: As you can see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library; however, this is a \"false alarm\" — for details, see [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)."
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -337,11 +352,11 @@
   "source": [
    "for i, chunk in enumerate(chunks):\n",
    "    print(f\"=== {i} ===\")\n",
-    "    txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n",
+    "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
    "\n",
    "    ser_txt = chunker.serialize(chunk=chunk)\n",
-    "    ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n",
+    "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
    "\n",
    "    print()"
--- a/docs/examples/inspect_picture_content.py
+++ b/docs/examples/inspect_picture_content.py
@@ -4,7 +4,7 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

-source = "tests/data/amt_handbook_sample.pdf"
+source = "tests/data/pdf/amt_handbook_sample.pdf"

 pipeline_options = PdfPipelineOptions()
 pipeline_options.images_scale = 2
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -0,0 +1,96 @@
+import json
+import time
+from pathlib import Path
+
+import yaml
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    VlmPipelineOptions,
+    granite_vision_vlm_conversion_options,
+    smoldocling_vlm_conversion_options,
+)
+from docling.datamodel.settings import settings
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.pipeline.vlm_pipeline import VlmPipeline
+
+sources = [
+    "tests/data/2305.03393v1-pg9-img.png",
+]
+
+## Use experimental VlmPipeline
+pipeline_options = VlmPipelineOptions()
+# If force_backend_text = True, text from backend will be used instead of generated text
+pipeline_options.force_backend_text = False
+
+## On GPU systems, enable flash_attention_2 with CUDA:
+# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
+# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
+
+## Pick a VLM model. We choose SmolDocling-256M by default
+pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+
+## Alternative VLM models:
+# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+
+from docling_core.types.doc import DocItemLabel, ImageRefMode
+from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
+
+## Set up pipeline for PDF or image inputs
+converter = DocumentConverter(
+    format_options={
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_cls=VlmPipeline,
+            pipeline_options=pipeline_options,
+        ),
+        InputFormat.IMAGE: PdfFormatOption(
+            pipeline_cls=VlmPipeline,
+            pipeline_options=pipeline_options,
+        ),
+    }
+)
+
+out_path = Path("scratch")
+out_path.mkdir(parents=True, exist_ok=True)
+
+for source in sources:
+    start_time = time.time()
+    print("================================================")
+    print("Processing... {}".format(source))
+    print("================================================")
+    print("")
+
+    res = converter.convert(source)
+
+    print("------------------------------------------------")
+    print("MD:")
+    print("------------------------------------------------")
+    print("")
+    print(res.document.export_to_markdown())
+
+    for page in res.pages:
+        print("")
+        print("Predicted page in DOCTAGS:")
+        print(page.predictions.vlm_response.text)
+
+    res.document.save_as_html(
+        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+        image_mode=ImageRefMode.REFERENCED,
+        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
+    )
+
+    with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
+        fp.write(json.dumps(res.document.export_to_dict()))
+
+    pg_num = res.document.num_pages()
+
+    print("")
+    inference_time = time.time() - start_time
+    print(
+        f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
+    )
+
+print("================================================")
+print("done!")
+print("================================================")
--- a/docs/examples/pictures_description.ipynb
+++ b/docs/examples/pictures_description.ipynb
--- a/docs/examples/pictures_description_api.py
+++ b/docs/examples/pictures_description_api.py
@@ -0,0 +1,118 @@
+import logging
+import os
+from pathlib import Path
+
+import requests
+from docling_core.types.doc import PictureItem
+from dotenv import load_dotenv
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    PictureDescriptionApiOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def vllm_local_options(model: str):
+    options = PictureDescriptionApiOptions(
+        url="http://localhost:8000/v1/chat/completions",
+        params=dict(
+            model=model,
+            seed=42,
+            max_completion_tokens=200,
+        ),
+        prompt="Describe the image in three sentences. Be consise and accurate.",
+        timeout=90,
+    )
+    return options
+
+
+def watsonx_vlm_options():
+    load_dotenv()
+    api_key = os.environ.get("WX_API_KEY")
+    project_id = os.environ.get("WX_PROJECT_ID")
+
+    def _get_iam_access_token(api_key: str) -> str:
+        res = requests.post(
+            url="https://iam.cloud.ibm.com/identity/token",
+            headers={
+                "Content-Type": "application/x-www-form-urlencoded",
+            },
+            data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}",
+        )
+        res.raise_for_status()
+        api_out = res.json()
+        print(f"{api_out=}")
+        return api_out["access_token"]
+
+    options = PictureDescriptionApiOptions(
+        url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
+        params=dict(
+            model_id="meta-llama/llama-3-2-11b-vision-instruct",
+            project_id=project_id,
+            parameters=dict(
+                max_new_tokens=400,
+            ),
+        ),
+        headers={
+            "Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
+        },
+        prompt="Describe the image in three sentences. Be consise and accurate.",
+        timeout=60,
+    )
+    return options
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+
+    pipeline_options = PdfPipelineOptions(
+        enable_remote_services=True  # <-- this is required!
+    )
+    pipeline_options.do_picture_description = True
+
+    # PictureDescriptionApiOptions() allows interfacing with APIs that support
+    # the multi-modal chat interface. Here follow a few examples of how to configure those.
+    #
+    # One possibility is self-hosting a model, e.g. via vLLM.
+    # $ vllm serve MODEL_NAME
+    # Then PictureDescriptionApiOptions can point to the localhost endpoint.
+    #
+    # Example for the Granite Vision model: (uncomment the following lines)
+    # pipeline_options.picture_description_options = vllm_local_options(
+    #     model="ibm-granite/granite-vision-3.1-2b-preview"
+    # )
+    #
+    # Example for the SmolVLM model: (uncomment the following lines)
+    pipeline_options.picture_description_options = vllm_local_options(
+        model="HuggingFaceTB/SmolVLM-256M-Instruct"
+    )
+    #
+    # Another possibility is using online services, e.g. watsonx.ai.
+    # This requires setting the environment variables WX_API_KEY and WX_PROJECT_ID.
+    # Uncomment the following line for this option:
+    # pipeline_options.picture_description_options = watsonx_vlm_options()
+
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+    result = doc_converter.convert(input_doc_path)
+
+    for element, _level in result.document.iterate_items():
+        if isinstance(element, PictureItem):
+            print(
+                f"Picture {element.self_ref}\n"
+                f"Caption: {element.caption_text(doc=result.document)}\n"
+                f"Annotations: {element.annotations}"
+            )
+
+
+if __name__ == "__main__":
+    main()
--- a/docs/examples/run_with_accelerator.py
+++ b/docs/examples/run_with_accelerator.py
@@ -14,7 +14,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/2206.01062.pdf")
+    input_doc = Path("./tests/data/pdf/2206.01062.pdf")

    # Explicitly set the accelerator
    # accelerator_options = AcceleratorOptions(
@@ -30,6 +30,9 @@ def main():
    #     num_threads=8, device=AcceleratorDevice.CUDA
    # )

+    # easyocr doesn't support cuda:N allocation; it defaults to cuda:0
+    # accelerator_options = AcceleratorOptions(num_threads=8, device="cuda:1")
+
    pipeline_options = PdfPipelineOptions()
    pipeline_options.accelerator_options = accelerator_options
    pipeline_options.do_ocr = True
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@@ -25,9 +25,8 @@ def main():
        Path("tests/data/docx/lorem_ipsum.docx"),
        Path("tests/data/pptx/powerpoint_sample.pptx"),
        Path("tests/data/2305.03393v1-pg9-img.png"),
-        Path("tests/data/2206.01062.pdf"),
-        Path("tests/data/test_01.asciidoc"),
-        Path("tests/data/test_01.asciidoc"),
+        Path("tests/data/pdf/2206.01062.pdf"),
+        Path("tests/data/asciidoc/test_01.asciidoc"),
    ]

    ## for defaults use:
@@ -44,6 +43,7 @@ def main():
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
+                InputFormat.CSV,
                InputFormat.MD,
            ],  # whitelist formats, non-matching files are ignored.
            format_options={
--- a/docs/examples/tesseract_lang_detection.py
+++ b/docs/examples/tesseract_lang_detection.py
@@ -10,7 +10,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/2206.01062.pdf")
+    input_doc = Path("./tests/data/pdf/2206.01062.pdf")

    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
    # ocr_options = TesseractOcrOptions(lang=["auto"])
--- a/docs/examples/translate.py
+++ b/docs/examples/translate.py
@@ -32,7 +32,7 @@ def translate(text: str, src: str = "en", dest: str = "de"):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter