Export of DrawingML figures into docling document

2025-12-08 12:48:28 +00:00 · 2025-10-13 15:02:34 +02:00
parent 10165dda8a
commit 9518fffcad
6 changed files with 443 additions and 11 deletions
--- a/docling/backend/docx/drawingml/utils.py
+++ b/docling/backend/docx/drawingml/utils.py
@@ -0,0 +1,99 @@
+import os
+import shutil
+import subprocess
+from pathlib import Path
+from tempfile import mkdtemp
+from typing import Callable, Optional
+
+import pypdfium2
+from docx.document import Document
+from PIL import Image, ImageChops
+
+
+def get_docx_to_pdf_converter() -> Optional[Callable]:
+    """
+    Detect the best available DOCX to PDF tool and return a conversion function.
+
+    The returned function accepts ``(input_path, output_path)``; each may be a
+    ``str`` or an ``os.PathLike``. It writes the converted PDF to ``output_path``
+    and raises ``subprocess.CalledProcessError`` if the tool exits with an error.
+
+    Returns None if no tool is available.
+    """
+
+    # Try LibreOffice (the binary is called "soffice" on some installations)
+    libreoffice_cmd = shutil.which("libreoffice") or shutil.which("soffice")
+    if libreoffice_cmd:
+
+        def convert_with_libreoffice(input_path, output_path):
+            # Normalise both arguments so that str and Path inputs are handled
+            # identically (a str never compares equal to a Path).
+            source = Path(input_path)
+            target = Path(output_path)
+            # Path.parent is "." for a bare file name, never an empty string,
+            # so "--outdir" always receives a usable directory.
+            out_dir = target.parent
+
+            subprocess.run(
+                [
+                    libreoffice_cmd,
+                    "--headless",
+                    "--convert-to",
+                    "pdf",
+                    "--outdir",
+                    str(out_dir),
+                    str(source),
+                ],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                check=True,
+            )
+
+            # LibreOffice only takes an output directory; the file it writes is
+            # named after the input file, so move it if another name was asked.
+            produced = out_dir / (source.stem + ".pdf")
+            if produced != target:
+                # os.replace overwrites an existing target on every platform
+                os.replace(produced, target)
+
+        return convert_with_libreoffice
+
+    # Space for other DOCX to PDF converters if available
+
+    # No tools found
+    return None
+
+
+def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
+    """
+    Trim the uniform background surrounding the content of ``image``.
+
+    ``bg_color`` defaults to the colour of the top-left pixel. The bounding box
+    of everything that differs from that colour is enlarged by ``padding``
+    pixels on each side (clamped to the image bounds) and used as crop region.
+    When nothing differs from the background, ``image`` is returned unchanged.
+    """
+    background_color = image.getpixel((0, 0)) if bg_color is None else bg_color
+    background = Image.new(image.mode, image.size, background_color)
+    content_box = ImageChops.difference(image, background).getbbox()
+
+    # getbbox() yields None when the image is entirely background
+    if not content_box:
+        return image
+
+    x0, y0, x1, y1 = content_box
+    crop_region = (
+        max(0, x0 - padding),
+        max(0, y0 - padding),
+        min(image.width, x1 + padding),
+        min(image.height, y1 + padding),
+    )
+    return image.crop(crop_region)
+
+
+def get_pil_from_dml_docx(
+    docx: Document, converter: Optional[Callable]
+) -> Optional[Image.Image]:
+    """
+    Render the first page of ``docx`` to a whitespace-cropped PIL image.
+
+    The document is saved into a temporary directory, converted to PDF with
+    ``converter`` (called as ``converter(docx_path, pdf_path)``), and the first
+    PDF page is rendered at scale 2 and passed through ``crop_whitespace``.
+
+    Returns None if ``converter`` is None. Any exception raised while saving,
+    converting or rendering propagates to the caller; the temporary directory
+    and the PDF handles are released in every case.
+    """
+    if converter is None:
+        return None
+
+    temp_dir = Path(mkdtemp())
+    try:
+        temp_docx = temp_dir / "drawing_only.docx"
+        temp_pdf = temp_dir / "drawing_only.pdf"
+
+        # 1) Save docx temporarily
+        docx.save(str(temp_docx))
+
+        # 2) Export to PDF
+        converter(temp_docx, temp_pdf)
+
+        # 3) Load PDF as PNG
+        pdf = pypdfium2.PdfDocument(temp_pdf)
+        try:
+            page = pdf[0]
+            try:
+                return crop_whitespace(page.render(scale=2).to_pil())
+            finally:
+                page.close()
+        finally:
+            pdf.close()
+    finally:
+        # Runs last, after the PDF handles are closed, on success and on error
+        shutil.rmtree(temp_dir, ignore_errors=True)
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -1,5 +1,6 @@
 import logging
 import re
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import Any, List, Optional, Union
@@ -33,6 +34,10 @@ from pydantic import AnyUrl
 from typing_extensions import override

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.docx.drawingml.utils import (
+    get_docx_to_pdf_converter,
+    get_pil_from_dml_docx,
+)
 from docling.backend.docx.latex.omml import oMath2Latex
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -64,6 +69,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.equation_bookends: str = "<eq>{EQ}</eq>"
        # Track processed textbox elements to avoid duplication
        self.processed_textbox_elements: List[int] = []
+        # Get docx 2 pdf converter if available
+        self.docx_to_pdf_converter = get_docx_to_pdf_converter()

        for i in range(-1, self.max_levels):
            self.parents[i] = None
@@ -80,18 +87,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            "indents": [None],
        }

-        self.docx_obj = None
-        try:
-            if isinstance(self.path_or_stream, BytesIO):
-                self.docx_obj = Document(self.path_or_stream)
-            elif isinstance(self.path_or_stream, Path):
-                self.docx_obj = Document(str(self.path_or_stream))
-
+        self.docx_obj = self.load_msword_file(
+            path_or_stream=self.path_or_stream, document_hash=self.document_hash
+        )
+        if self.docx_obj:
            self.valid = True
-        except Exception as e:
-            raise RuntimeError(
-                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
-            ) from e

    @override
    def is_valid(self) -> bool:
@@ -139,6 +139,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

+    @staticmethod
+    def load_msword_file(
+        path_or_stream: Union[BytesIO, Path], document_hash: str
+    ) -> Optional[DocxDocument]:
+        """Open a Word document from an in-memory stream or a filesystem path.
+
+        Args:
+            path_or_stream: The DOCX content as a ``BytesIO`` or the ``Path`` of
+                the file to open.
+            document_hash: Hash of the document; only used in the error message.
+
+        Returns:
+            The loaded document, or None when ``path_or_stream`` is neither a
+            ``BytesIO`` nor a ``Path``.
+
+        Raises:
+            RuntimeError: If opening the document fails for any reason; the
+                original exception is chained as the cause.
+        """
+        try:
+            if isinstance(path_or_stream, BytesIO):
+                return Document(path_or_stream)
+            if isinstance(path_or_stream, Path):
+                return Document(str(path_or_stream))
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {document_hash}"
+            ) from e
+
+        # Unsupported input type: nothing was loaded
+        return None
+
    def _update_history(
        self,
        name: str,
@@ -195,6 +211,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            }
            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
            drawing_blip = xpath_expr(element)
+            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)

            # Check for textbox content - check multiple textbox formats
            # Only process if the element hasn't been processed before
@@ -274,6 +291,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                ):
                    te1 = self._handle_text_elements(element, docx_obj, doc)
                    added_elements.extend(te1)
+            # Check for DrawingML elements
+            elif drawingml_els:
+                if self.docx_to_pdf_converter is None:
+                    _log.warning(
+                        "Found DrawingML elements in document, but no DOCX to PDF converters. "
+                        "If you want these exported, make sure you have "
+                        "LibreOffice binary in PATH. "
+                    )
+                else:
+                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
            # Check for the sdt containers, like table of contents
            elif tag_name in ["sdt"]:
                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -1381,3 +1408,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                )
                elem_ref.append(p3.get_ref())
        return elem_ref
+
+    def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any) -> None:
+        """Render DrawingML elements to an image and add it to ``doc`` as a picture.
+
+        The source document is loaded again and its body emptied, the given
+        ``w:drawing`` elements are copied into a single new run, and the result
+        is exported DOCX -> PDF -> image. If any step of that export fails, a
+        picture item without image data is added instead, so a single broken
+        drawing does not abort the conversion of the whole document.
+
+        Args:
+            doc: The document the picture item is added to.
+            drawingml_els: Iterable of DrawingML XML elements to render.
+        """
+        # Local import: only needed to recognise a failed converter process
+        import subprocess
+
+        # 1) Make an empty copy of the original document
+        dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
+        body = dml_doc._element.body
+        for child in list(body):
+            body.remove(child)
+
+        # 2) Add DrawingML to empty document
+        new_para = dml_doc.add_paragraph()
+        new_r = new_para.add_run()
+        for dml in drawingml_els:
+            # Copy, so the element is not moved out of the document being parsed
+            new_r._r.append(deepcopy(dml))
+
+        # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
+        level = self._get_level()
+        parent = self.parents[level - 1]
+        picture_added = False
+        try:
+            pil_image = get_pil_from_dml_docx(
+                dml_doc, converter=self.docx_to_pdf_converter
+            )
+            if pil_image is None:
+                raise UnidentifiedImageError
+
+            doc.add_picture(
+                parent=parent,
+                image=ImageRef.from_pil(image=pil_image, dpi=72),
+                caption=None,
+            )
+            picture_added = True
+        except subprocess.SubprocessError as exc:
+            # The external converter is run with check=True and raises
+            # CalledProcessError on a non-zero exit status.
+            _log.warning("DrawingML conversion to PDF failed: %s", exc)
+        except (UnidentifiedImageError, OSError):
+            _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
+
+        if not picture_added:
+            # Keep a placeholder so the figure's position in the document is kept
+            doc.add_picture(
+                parent=parent,
+                caption=None,
+            )
--- a/tests/data/docx/drawingml.docx
+++ b/tests/data/docx/drawingml.docx
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.itxt
@@ -0,0 +1,13 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group textbox
+    item-2 at level 2: text: Text 2
+    item-3 at level 2: text: Text 1
+  item-4 at level 1: picture
+  item-5 at level 1: text: 
+  item-6 at level 1: text: 
+  item-7 at level 1: text: 
+  item-8 at level 1: text: 
+  item-9 at level 1: text: 
+  item-10 at level 1: text: 
+  item-11 at level 1: text: 
+  item-12 at level 1: picture
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.json
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.json
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.md
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.md
@@ -0,0 +1,7 @@
+Text 2
+
+Text 1
+
+<!-- image -->
+
+<!-- image -->