Added Custom Serializer for Table enrichment

Signed-off-by: Nikhil Khandelwal <nikhil.khandelwal3@ibm.com>
2025-07-27 04:24:45 +00:00 · 2025-05-15 00:11:00 +05:30 · 2025-05-15 00:11:00 +05:30 · 540610e4dc
commit 540610e4dc
parent f2c019cad7
1 changed files with 238 additions and 13 deletions
--- a/docs/examples/serialization.ipynb
+++ b/docs/examples/serialization.ipynb
@ -47,7 +47,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -60,7 +60,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@ -90,14 +90,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "/Users/pva/work/github.com/DS4SD/docling/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
+      "<venv>\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "c:\\Users\\NikhilKhandelwal3\\Desktop\\Electrolux\\Pipelines\\doc\\Lib\\site-packages\\torch\\utils\\data\\dataloader.py:665: UserWarning: 'pin_memory' argument is set as true but no accelerator is found, then device pinned memory won't be used.\n",
      "  warnings.warn(warn_msg)\n"
     ]
    }
@ -122,7 +124,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@ -197,7 +199,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@ -206,6 +208,8 @@
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
       "│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.                                                                                              │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ &lt;!-- table --&gt;                                                                                                                                                                                                 │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ | Report         | Question                                                         | Answer                                                                                                          |        │\n",
       "│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|        │\n",
       "│ | IBM 2022       | How many hours were spent on employee learning in 2021?          | 22.5 million hours                                                                                              |        │\n",
@ -243,6 +247,8 @@
       "╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
       "│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.                                                                                              │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ <!-- table -->                                                                                                                                                                                                 │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ | Report         | Question                                                         | Answer                                                                                                          |        │\n",
       "│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|        │\n",
       "│ | IBM 2022       | How many hours were spent on employee learning in 2021?          | 22.5 million hours                                                                                              |        │\n",
@ -310,7 +316,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
@ -428,14 +434,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "/Users/pva/work/github.com/DS4SD/docling/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
+      "<venv>\\Lib\\site-packages\\torch\\utils\\data\\dataloader.py:665: UserWarning: 'pin_memory' argument is set as true but no accelerator is found, then device pinned memory won't be used.\n",
      "  warnings.warn(warn_msg)\n"
     ]
    }
@ -473,7 +479,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@ -540,7 +546,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@ -549,6 +555,8 @@
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
       "│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.                                                                                              │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ &lt;!-- table --&gt;                                                                                                                                                                                                 │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ | Report         | Question                                                         | Answer                                                                                                          |        │\n",
       "│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|        │\n",
       "│ | IBM 2022       | How many hours were spent on employee learning in 2021?          | 22.5 million hours                                                                                              |        │\n",
@ -587,6 +595,8 @@
       "╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
       "│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.                                                                                              │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ <!-- table -->                                                                                                                                                                                                 │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ | Report         | Question                                                         | Answer                                                                                                          |        │\n",
       "│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|        │\n",
       "│ | IBM 2022       | How many hours were spent on employee learning in 2021?          | 22.5 million hours                                                                                              |        │\n",
@ -639,11 +649,226 @@
    "\n",
    "print_in_console(ser_text[ser_text.find(start_cue) : ser_text.find(stop_cue)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creating a custom serializer for Table description enrichment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from docling.datamodel.base_models import InputFormat\n",
    "from docling.datamodel.pipeline_options import (\n",
    "    PdfPipelineOptions,\n",
    "    PictureDescriptionVlmOptions,\n",
    ")\n",
    "from docling.document_converter import DocumentConverter, PdfFormatOption\n",
    "\n",
    "pipeline_options = PdfPipelineOptions(\n",
    "    do_table_description=True,\n",
    "    generate_table_images=True,\n",
    "    images_scale=2,\n",
    ")\n",
    "\n",
    "converter = DocumentConverter(\n",
    "    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}\n",
    ")\n",
    "doc = converter.convert(source=DOC_SOURCE).document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Any, Optional\n",
    "\n",
    "from docling_core.transforms.serializer.base import (\n",
    "    BaseDocSerializer,\n",
    "    SerializationResult,\n",
    ")\n",
    "from docling_core.transforms.serializer.common import create_ser_result\n",
    "from docling_core.transforms.serializer.markdown import (\n",
    "    MarkdownParams,\n",
    "    MarkdownTableSerializer\n",
    ")\n",
    "from docling_core.types.doc.document import (\n",
    "    DoclingDocument,\n",
    "    ImageRefMode,\n",
    "    PictureDescriptionData,\n",
    "    TableItem\n",
    ")\n",
    "from typing_extensions import override\n",
    "\n",
    "\n",
    "class AnnotationTableSerializer(MarkdownTableSerializer):\n",
    "    @override\n",
    "    def serialize(\n",
    "        self,\n",
    "        *,\n",
    "        item: TableItem,\n",
    "        doc_serializer: BaseDocSerializer,\n",
    "        doc: DoclingDocument,\n",
    "        separator: Optional[str] = None,\n",
    "        **kwargs: Any,\n",
    "    ) -> SerializationResult:\n",
    "        text_parts: list[str] = []\n",
    "\n",
    "        # reusing the existing result:\n",
    "        parent_res = super().serialize(\n",
    "            item=item,\n",
    "            doc_serializer=doc_serializer,\n",
    "            doc=doc,\n",
    "            **kwargs,\n",
    "        )\n",
    "        text_parts.append(parent_res.text)\n",
    "\n",
    "        # appending annotations:\n",
    "        for annotation in item.annotations:\n",
    "            if isinstance(annotation, PictureDescriptionData):\n",
    "                text_parts.append(f\"\\n<!-- Table description: {annotation.text} -->\")\n",
    "\n",
    "        text_res = (separator or \"\\n\").join(text_parts)\n",
    "        return create_ser_result(text=text_res, span_source=item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
       "│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.                                                                                              │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ | Report         | Question                                                         | Answer                                                                                                          |        │\n",
       "│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|        │\n",
       "│ | IBM 2022       | How many hours were spent on employee learning in 2021?          | 22.5 million hours                                                                                              |        │\n",
       "│ | IBM 2022       | What was the rate of fatalities in 2021?                         | The rate of fatalities in 2021 was 0.0016.                                                                      |        │\n",
       "│ | IBM 2022       | How many full audits were con- ducted in 2022 in India?          | 2                                                                                                               |        │\n",
       "│ | Starbucks 2022 | What is the percentage of women in the Board of Directors?       | 25%                                                                                                             |        │\n",
       "│ | Starbucks 2022 | What was the total energy con- sumption in 2021?                 | According to the table, the total energy consumption in 2021 was 2,491,543 MWh.                                 |        │\n",
       "│ | Starbucks 2022 | How much packaging material was made from renewable mate- rials? | According to the given data, 31% of packaging materials were made from recycled or renewable materials in FY22. |        │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ &lt;!-- Table description: The image is a bar chart that shows the number of hours spent on employee learning in 2021. The x-axis represents the number of hours, ranging from 0 to 22.5 million, while the       │\n",
       "│ y-axis represents the number of hours, ranging from 0 to 22.5 million. The chart shows that the number of hours spent on employee learning increased from 2021 to 2022, with the highest number of hours spent │\n",
       "│ on employee learning in 2022.                                                                                                                                                                                  │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ The bar chart is visually structured with the x-axis labeled \"Number of Hours\" and the y-axis labeled \"Number of Hours\". The x-axis is labeled \"Number of Hours\" and the y-axis is labeled \"Number of Hours\".  │\n",
       "│ The chart shows that the number of hours spent on employee learning increased from 2021 to 2022.                                                                                                               │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ The bar chart is visually structured with the --&gt;                                                                                                                                                              │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system.                                                                                                    │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the      │\n",
       "│ response.                                                                                                                                                                                                      │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ ## Related Work                                                                                                                                                                                                │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ The DocQA integrates multiple AI technologies, namely:                                                                                                                                                         │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric layout │\n",
       "│ analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et al.    │\n",
       "│ 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based .      │\n",
       "│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning-                            │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ Figure 1: System architecture: Simplified sketch of document question-answering pipeline.                                                                                                                      │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ &lt;!-- image --&gt;                                                                                                                                                                                                 │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018).                            │\n",
       "│                                                                                                                                                                                                                │\n",
       "│                                                                                                                                                                                                                │\n",
       "╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
       "│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.                                                                                              │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ | Report         | Question                                                         | Answer                                                                                                          |        │\n",
       "│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|        │\n",
       "│ | IBM 2022       | How many hours were spent on employee learning in 2021?          | 22.5 million hours                                                                                              |        │\n",
       "│ | IBM 2022       | What was the rate of fatalities in 2021?                         | The rate of fatalities in 2021 was 0.0016.                                                                      |        │\n",
       "│ | IBM 2022       | How many full audits were con- ducted in 2022 in India?          | 2                                                                                                               |        │\n",
       "│ | Starbucks 2022 | What is the percentage of women in the Board of Directors?       | 25%                                                                                                             |        │\n",
       "│ | Starbucks 2022 | What was the total energy con- sumption in 2021?                 | According to the table, the total energy consumption in 2021 was 2,491,543 MWh.                                 |        │\n",
       "│ | Starbucks 2022 | How much packaging material was made from renewable mate- rials? | According to the given data, 31% of packaging materials were made from recycled or renewable materials in FY22. |        │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ <!-- Table description: The image is a bar chart that shows the number of hours spent on employee learning in 2021. The x-axis represents the number of hours, ranging from 0 to 22.5 million, while the       │\n",
       "│ y-axis represents the number of hours, ranging from 0 to 22.5 million. The chart shows that the number of hours spent on employee learning increased from 2021 to 2022, with the highest number of hours spent │\n",
       "│ on employee learning in 2022.                                                                                                                                                                                  │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ The bar chart is visually structured with the x-axis labeled \"Number of Hours\" and the y-axis labeled \"Number of Hours\". The x-axis is labeled \"Number of Hours\" and the y-axis is labeled \"Number of Hours\".  │\n",
       "│ The chart shows that the number of hours spent on employee learning increased from 2021 to 2022.                                                                                                               │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ The bar chart is visually structured with the -->                                                                                                                                                              │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system.                                                                                                    │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the      │\n",
       "│ response.                                                                                                                                                                                                      │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ ## Related Work                                                                                                                                                                                                │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ The DocQA integrates multiple AI technologies, namely:                                                                                                                                                         │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric layout │\n",
       "│ analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et al.    │\n",
       "│ 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based .      │\n",
       "│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning-                            │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ Figure 1: System architecture: Simplified sketch of document question-answering pipeline.                                                                                                                      │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ <!-- image -->                                                                                                                                                                                                 │\n",
       "│                                                                                                                                                                                                                │\n",
       "│ based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018).                            │\n",
       "│                                                                                                                                                                                                                │\n",
       "│                                                                                                                                                                                                                │\n",
       "╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from docling_core.transforms.serializer.markdown import MarkdownDocSerializer\n",
    "\n",
    "serializer = MarkdownDocSerializer(\n",
    "    doc=doc,\n",
    "    table_serializer=AnnotationTableSerializer(),\n",
    "    params=MarkdownParams(\n",
    "        table_placeholder=\"\",\n",
    "    ),\n",
    ")\n",
    "ser_result = serializer.serialize()\n",
    "ser_text = ser_result.text\n",
    "\n",
    "print_in_console(ser_text[ser_text.find(start_cue) : ser_text.find(stop_cue)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "doc",
   "language": "python",
   "name": "python3"
  },
@ -657,7 +882,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.13.2"
+   "version": "3.12.10"
  }
 },
 "nbformat": 4,