docs: improve docs on token limit warning triggered by HybridChunker (#1077)

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-02-28 14:54:46 +01:00
parent de7b963b09
commit db3ceefd4a
2 changed files with 27 additions and 9 deletions
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -83,7 +83,15 @@
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors\n"
+     ]
+    }
+   ],
   "source": [
    "from docling.chunking import HybridChunker\n",
    "\n",
@@ -91,6 +99,13 @@
    "chunk_iter = chunker.chunk(dl_doc=doc)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> 👉 **NOTE**: As you can see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library; however, this is a \"false alarm\" — for details, check [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)."
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -337,11 +352,11 @@
   "source": [
    "for i, chunk in enumerate(chunks):\n",
    "    print(f\"=== {i} ===\")\n",
-    "    txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n",
+    "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
    "\n",
    "    ser_txt = chunker.serialize(chunk=chunk)\n",
-    "    ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n",
+    "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
    "\n",
    "    print()"