feat(docx): Process drawingml objects in docx (#2453)

* Export of DrawingML figures into docling document * Adding libreoffice env var and libreoffice to checks image Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * DCO Remediation Commit for Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> I, Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>, hereby add my Signed-off-by to this commit: 9518fffcad Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * Enforcing apt get update Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * Only display drawingml warning once per document Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * add util to test libreoffice and exclude files from test when not found Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * check libreoffice only once Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Only initialise converter if needed Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> --------- Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-10-15 10:58:08 +02:00
parent 3e6da2c62d
commit 16829939cf
8 changed files with 512 additions and 25 deletions
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -80,10 +80,8 @@ jobs:

        - name: Install System Dependencies
          run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
            sudo apt-get -qq update
-            fi
-            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config

        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -149,10 +147,8 @@ jobs:

        - name: Install System Dependencies
          run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
            sudo apt-get -qq update
-            fi
-            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config

        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -223,10 +219,8 @@ jobs:

        - name: Install System Dependencies
          run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
            sudo apt-get -qq update
-            fi
-            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config

        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
--- a/docling/backend/docx/drawingml/utils.py
+++ b/docling/backend/docx/drawingml/utils.py
@@ -0,0 +1,131 @@
+import os
+import shutil
+import subprocess
+from pathlib import Path
+from tempfile import mkdtemp
+from typing import Callable, Optional
+
+import pypdfium2
+from docx.document import Document
+from PIL import Image, ImageChops
+
+
+def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
+    """Return the LibreOffice command and optionally test it.
+
+    Lookup order: the ``DOCLING_LIBREOFFICE_CMD`` environment variable (a
+    command name on PATH or a path to the executable), then ``libreoffice``
+    and ``soffice`` on PATH, then the default macOS application bundle.
+    An environment value that does not resolve to an executable is ignored
+    and the automatic lookup is used instead.
+
+    Args:
+        raise_if_unavailable: When True, raise instead of returning None if
+            no command is found, and run ``<cmd> -h`` to check that the
+            command can actually be executed.
+
+    Returns:
+        The resolved command, or None when nothing was found and
+        ``raise_if_unavailable`` is False.
+
+    Raises:
+        RuntimeError: No command was found and ``raise_if_unavailable`` is
+            True.
+        subprocess.CalledProcessError: The ``-h`` test run exited with a
+            non-zero status (only when ``raise_if_unavailable`` is True).
+    """
+
+    macos_bundle_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
+
+    # The "no DOCX to PDF converters" warning of the DOCX backend tells users
+    # to set this variable, so it has to take precedence over auto-detection.
+    env_cmd = os.environ.get("DOCLING_LIBREOFFICE_CMD")
+
+    libreoffice_cmd = (
+        (shutil.which(env_cmd) if env_cmd else None)
+        or shutil.which("libreoffice")
+        or shutil.which("soffice")
+        or (macos_bundle_cmd if os.path.isfile(macos_bundle_cmd) else None)
+    )
+
+    if raise_if_unavailable:
+        if libreoffice_cmd is None:
+            raise RuntimeError("Libreoffice not found")
+
+        # The following test will raise if the libreoffice_cmd cannot be used
+        subprocess.run(
+            [
+                libreoffice_cmd,
+                "-h",
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            check=True,
+        )
+
+    return libreoffice_cmd
+
+
+def get_docx_to_pdf_converter() -> Optional[Callable]:
+    """
+    Pick a DOCX to PDF tool and wrap it in a conversion function.
+
+    Only LibreOffice is supported at the moment. The returned callable takes
+    (input_path, output_path), runs LibreOffice headless with the directory of
+    output_path as output directory, and renames the produced
+    "<input stem>.pdf" to output_path when the two differ.
+    Returns None if no tool is available.
+    """
+
+    soffice = get_libreoffice_cmd()
+    if not soffice:
+        # No tools found.
+        ## Space for other DOCX to PDF converters if available
+        return None
+
+    def convert_with_libreoffice(input_path, output_path):
+        out_dir = os.path.dirname(output_path)
+        command = [
+            soffice,
+            "--headless",
+            "--convert-to",
+            "pdf",
+            "--outdir",
+            out_dir,
+            input_path,
+        ]
+        # check=True: a non-zero exit status raises CalledProcessError
+        subprocess.run(
+            command,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            check=True,
+        )
+
+        # LibreOffice names the result after the input file, not after the
+        # requested output path
+        stem, _ext = os.path.splitext(os.path.basename(input_path))
+        produced = os.path.join(out_dir, stem + ".pdf")
+        if produced != output_path:
+            os.rename(produced, output_path)
+
+    return convert_with_libreoffice
+
+
+def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
+    """Crop ``image`` to the bounding box of everything that is not background.
+
+    Args:
+        image: Image to crop.
+        bg_color: Background colour to compare against; when None, the colour
+            of the top-left pixel is used.
+        padding: Number of pixels to keep around the detected content,
+            clamped to the image borders.
+
+    Returns:
+        The cropped image, or ``image`` itself when no pixel differs from the
+        background.
+    """
+    background = image.getpixel((0, 0)) if bg_color is None else bg_color
+
+    # Pixels equal to the background become zero in the difference image, so
+    # its bounding box encloses only the content.
+    canvas = Image.new(image.mode, image.size, background)
+    content_box = ImageChops.difference(image, canvas).getbbox()
+    if not content_box:
+        return image
+
+    x0, y0, x1, y1 = content_box
+    return image.crop(
+        (
+            max(0, x0 - padding),
+            max(0, y0 - padding),
+            min(image.width, x1 + padding),
+            min(image.height, y1 + padding),
+        )
+    )
+
+
+def get_pil_from_dml_docx(
+    docx: Document, converter: Optional[Callable]
+) -> Optional[Image.Image]:
+    """Render the first page of ``docx`` to a cropped PIL image.
+
+    The document is saved to a temporary directory, converted to PDF with
+    ``converter``, and the first PDF page is rendered at scale 2 and passed
+    through ``crop_whitespace``. The temporary directory and the PDF handles
+    are released even when one of these steps raises.
+
+    Args:
+        docx: Document to render.
+        converter: Callable taking (input_path, output_path) that writes the
+            PDF, or None when no converter is available.
+
+    Returns:
+        The rendered image, or None when ``converter`` is None.
+
+    Raises:
+        Any exception raised while saving, converting or rendering is
+        propagated unchanged after cleanup.
+    """
+    if converter is None:
+        return None
+
+    temp_dir = Path(mkdtemp())
+    try:
+        temp_docx = temp_dir / "drawing_only.docx"
+        temp_pdf = temp_dir / "drawing_only.pdf"
+
+        # 1) Save docx temporarily
+        docx.save(str(temp_docx))
+
+        # 2) Export to PDF
+        converter(temp_docx, temp_pdf)
+
+        # 3) Render the first PDF page and crop the surrounding background
+        pdf = pypdfium2.PdfDocument(temp_pdf)
+        try:
+            page = pdf[0]
+            try:
+                image = crop_whitespace(page.render(scale=2).to_pil())
+            finally:
+                page.close()
+        finally:
+            pdf.close()
+    finally:
+        # Always remove the scratch files, including after a failed conversion
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+    return image
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -1,8 +1,9 @@
 import logging
 import re
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union

 from docling_core.types.doc import (
    DocItemLabel,
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
 from typing_extensions import override

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.docx.drawingml.utils import (
+    get_docx_to_pdf_converter,
+    get_libreoffice_cmd,
+    get_pil_from_dml_docx,
+)
 from docling.backend.docx.latex.omml import oMath2Latex
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -64,6 +70,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.equation_bookends: str = "<eq>{EQ}</eq>"
        # Track processed textbox elements to avoid duplication
        self.processed_textbox_elements: List[int] = []
+        self.docx_to_pdf_converter: Optional[Callable] = None
+        self.docx_to_pdf_converter_init = False
+        self.display_drawingml_warning = True

        for i in range(-1, self.max_levels):
            self.parents[i] = None
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            "indents": [None],
        }

-        self.docx_obj = None
-        try:
-            if isinstance(self.path_or_stream, BytesIO):
-                self.docx_obj = Document(self.path_or_stream)
-            elif isinstance(self.path_or_stream, Path):
-                self.docx_obj = Document(str(self.path_or_stream))
-
+        self.docx_obj = self.load_msword_file(
+            path_or_stream=self.path_or_stream, document_hash=self.document_hash
+        )
+        if self.docx_obj:
            self.valid = True
-        except Exception as e:
-            raise RuntimeError(
-                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
-            ) from e

    @override
    def is_valid(self) -> bool:
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

+    @staticmethod
+    def load_msword_file(
+        path_or_stream: Union[BytesIO, Path], document_hash: str
+    ) -> Optional[DocxDocument]:
+        """Open a Word document from an in-memory stream or a filesystem path.
+
+        Args:
+            path_or_stream: BytesIO holding the DOCX content, or Path to the
+                file on disk.
+            document_hash: Hash of the document; only used in the error
+                message.
+
+        Returns:
+            The loaded document, or None when ``path_or_stream`` is neither a
+            BytesIO nor a Path.
+
+        Raises:
+            RuntimeError: The document could not be opened; the original
+                exception is chained as the cause.
+        """
+        try:
+            if isinstance(path_or_stream, BytesIO):
+                return Document(path_or_stream)
+            if isinstance(path_or_stream, Path):
+                return Document(str(path_or_stream))
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {document_hash}"
+            ) from e
+
+        # Unsupported input type: nothing was loaded
+        return None
+
    def _update_history(
        self,
        name: str,
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            }
            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
            drawing_blip = xpath_expr(element)
+            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)

            # Check for textbox content - check multiple textbox formats
            # Only process if the element hasn't been processed before
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                ):
                    te1 = self._handle_text_elements(element, docx_obj, doc)
                    added_elements.extend(te1)
+            # Check for DrawingML elements
+            elif drawingml_els:
+                if (
+                    self.docx_to_pdf_converter is None
+                    and self.docx_to_pdf_converter_init is False
+                ):
+                    self.docx_to_pdf_converter = get_docx_to_pdf_converter()
+                    self.docx_to_pdf_converter_init = True
+
+                if self.docx_to_pdf_converter is None:
+                    if self.display_drawingml_warning:
+                        if self.docx_to_pdf_converter is None:
+                            _log.warning(
+                                "Found DrawingML elements in document, but no DOCX to PDF converters. "
+                                "If you want these exported, make sure you have "
+                                "LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
+                            )
+                            self.display_drawingml_warning = False
+                else:
+                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
            # Check for the sdt containers, like table of contents
            elif tag_name in ["sdt"]:
                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                )
                elem_ref.append(p3.get_ref())
        return elem_ref
+
+    def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any) -> None:
+        """Render DrawingML elements to an image and add it to ``doc``.
+
+        The source document is reloaded, its body is emptied, and copies of
+        ``drawingml_els`` are placed in a single new run. That document is
+        rendered through ``self.docx_to_pdf_converter`` and the result is
+        added as a picture under the current parent. If rendering fails, a
+        picture without image data is added instead so the item is still
+        recorded in the output.
+
+        Args:
+            doc: Document receiving the picture item.
+            drawingml_els: Iterable of ``w:drawing`` XML elements to render.
+        """
+        # Function-scope import: only needed to recognise a failed converter
+        # subprocess in the fallback below.
+        import subprocess
+
+        # 1) Make an empty copy of the original document
+        dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
+        body = dml_doc._element.body
+        for child in list(body):
+            body.remove(child)
+
+        # 2) Add DrawingML to empty document
+        new_para = dml_doc.add_paragraph()
+        new_r = new_para.add_run()
+        for dml in drawingml_els:
+            # deepcopy so the elements stay attached to the original tree
+            new_r._r.append(deepcopy(dml))
+
+        # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
+        level = self._get_level()
+        try:
+            pil_image = get_pil_from_dml_docx(
+                dml_doc, converter=self.docx_to_pdf_converter
+            )
+            if pil_image is None:
+                raise UnidentifiedImageError
+
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                image=ImageRef.from_pil(image=pil_image, dpi=72),
+                caption=None,
+            )
+        except (UnidentifiedImageError, OSError, subprocess.CalledProcessError):
+            # CalledProcessError: the converter runs with check=True, so a
+            # failed conversion must not abort the whole document.
+            _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                caption=None,
+            )
--- a/tests/data/docx/drawingml.docx
+++ b/tests/data/docx/drawingml.docx
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.itxt
@@ -0,0 +1,13 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group textbox
+    item-2 at level 2: text: Text 2
+    item-3 at level 2: text: Text 1
+  item-4 at level 1: picture
+  item-5 at level 1: text: 
+  item-6 at level 1: text: 
+  item-7 at level 1: text: 
+  item-8 at level 1: text: 
+  item-9 at level 1: text: 
+  item-10 at level 1: text: 
+  item-11 at level 1: text: 
+  item-12 at level 1: picture
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.json
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.json
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.md
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.md
@@ -0,0 +1,7 @@
+Text 2
+
+Text 1
+
+<!-- image -->
+
+<!-- image -->
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -1,7 +1,9 @@
+import os
 from pathlib import Path

 import pytest

+from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import (
@@ -17,6 +19,7 @@ from .test_data_gen_flag import GEN_TEST_DATA
 from .verify_utils import verify_document, verify_export

 GENERATE = GEN_TEST_DATA
+IS_CI = bool(os.getenv("CI"))


@pytest.mark.xfail(strict=False)
@@ -84,8 +87,22 @@ def get_converter():
 def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
    converter = get_converter()

+    has_libreoffice = False
+    try:
+        cmd = get_libreoffice_cmd(raise_if_unavailable=True)
+        if cmd is not None:
+            has_libreoffice = True
+    except Exception:
+        pass
+
    for docx_path in docx_paths:
-        # print(f"converting {docx_path}")
+        if (
+            not IS_CI
+            and not has_libreoffice
+            and str(docx_path) in ("tests/data/docx/drawingml.docx",)
+        ):
+            print(f"Skipping {docx_path} because no Libreoffice is installed.")
+            continue

        gt_path = (
            docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name