Adding feature to import DrawingML objects into DoclingDocument

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
2025-12-08 12:48:28 +00:00 · 2025-08-14 16:25:59 +02:00
parent 5f050f94e1
commit cffe1f0ae5
2 changed files with 208 additions and 11 deletions
--- a/docling/backend/docx/drawingml/utils.py
+++ b/docling/backend/docx/drawingml/utils.py
@@ -0,0 +1,133 @@
+import os
+import shutil
+import subprocess
+from pathlib import Path
+from tempfile import mkdtemp
+from typing import Callable, Optional
+
+import pypdfium2
+from docx.document import Document
+from PIL import Image, ImageChops
+
+
+def _move_converted_pdf(input_path, output_path) -> None:
+    """
+    Rename a converter's default output file to ``output_path`` if they differ.
+
+    Converters that are only given an output directory are expected to write
+    ``<input file stem>.pdf`` into the directory of ``output_path``; this
+    helper renames that file to the name the caller asked for.
+    """
+    # Normalise to plain strings first: callers may pass pathlib.Path objects,
+    # and a str never compares equal to a Path, which would trigger a rename
+    # even when the file already has the requested name.
+    input_path = os.fspath(input_path)
+    output_path = os.fspath(output_path)
+
+    expected_output = os.path.join(
+        os.path.dirname(output_path),
+        os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
+    )
+    if expected_output != output_path:
+        os.rename(expected_output, output_path)
+
+
+def get_docx_to_pdf_converter() -> Optional[Callable]:
+    """
+    Detects the best available DOCX to PDF tool and returns a conversion function.
+
+    Tools are probed in this order: LibreOffice (a ``libreoffice`` or
+    ``soffice`` executable on PATH), the ``docx2pdf`` package, and finally
+    ``pypandoc`` together with a ``pandoc`` executable on PATH.
+
+    The returned function accepts (input_path, output_path), each given as a
+    string or an ``os.PathLike`` object.
+    Returns None if no tool is available.
+    """
+
+    # Try LibreOffice
+    libreoffice_cmd = shutil.which("libreoffice") or shutil.which("soffice")
+    if libreoffice_cmd:
+
+        def convert_with_libreoffice(input_path, output_path):
+            input_path = os.fspath(input_path)
+            output_path = os.fspath(output_path)
+
+            # check=True: a non-zero exit status raises CalledProcessError.
+            subprocess.run(
+                [
+                    libreoffice_cmd,
+                    "--headless",
+                    "--convert-to",
+                    "pdf",
+                    "--outdir",
+                    os.path.dirname(output_path),
+                    input_path,
+                ],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                check=True,
+            )
+
+            # Only an output directory is passed above, so move the result
+            # to the requested file name if necessary.
+            _move_converted_pdf(input_path, output_path)
+
+        return convert_with_libreoffice
+
+    # Try docx2pdf (MS Word required)
+    try:
+        import docx2pdf  # type: ignore
+
+        def convert_with_docx2pdf(input_path, output_path):
+            from docx2pdf import convert  # type: ignore
+
+            input_path = os.fspath(input_path)
+            output_path = os.fspath(output_path)
+
+            convert(input_path, os.path.dirname(output_path))
+
+            # Move result if necessary
+            _move_converted_pdf(input_path, output_path)
+
+        return convert_with_docx2pdf
+    except ImportError:
+        pass
+
+    # Try Pandoc
+    try:
+        import pypandoc  # type: ignore
+
+        if shutil.which("pandoc"):
+
+            def convert_with_pandoc(input_path, output_path):
+                import pypandoc  # type: ignore
+
+                pypandoc.convert_file(
+                    os.fspath(input_path),
+                    "pdf",
+                    outputfile=os.fspath(output_path),
+                )
+
+            return convert_with_pandoc
+    except ImportError:
+        pass
+
+    # No tools found
+    return None
+
+
+def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
+    """
+    Crop the border of ``image`` that matches the background color.
+
+    ``bg_color`` is the color treated as background; when it is None the
+    pixel at (0, 0) is used. ``padding`` is the number of pixels kept around
+    the detected content, clamped to the image bounds.
+
+    Returns the cropped image, or ``image`` itself when no bounding box of
+    differing pixels is found.
+    """
+    background_color = image.getpixel((0, 0)) if bg_color is None else bg_color
+
+    # Compare the image against a solid canvas of the background color and
+    # take the bounding box of the difference.
+    background = Image.new(image.mode, image.size, background_color)
+    content_box = ImageChops.difference(image, background).getbbox()
+
+    if not content_box:
+        return image
+
+    x0, y0, x1, y1 = content_box
+    return image.crop(
+        (
+            max(0, x0 - padding),
+            max(0, y0 - padding),
+            min(image.width, x1 + padding),
+            min(image.height, y1 + padding),
+        )
+    )
+
+
+def get_pil_from_dml_docx(
+    docx: Document, converter: Optional[Callable]
+) -> Optional[Image.Image]:
+    """
+    Render the first page of ``docx`` to a whitespace-cropped PIL image.
+
+    The document is saved to a temporary directory, converted to PDF with
+    ``converter`` (called as ``converter(docx_path, pdf_path)``), and the
+    first PDF page is rendered at scale 2 and passed through
+    ``crop_whitespace``.
+
+    Returns None if ``converter`` is None. Exceptions raised while saving,
+    converting or rendering propagate to the caller; the temporary directory
+    and the PDF handles are released in every case.
+    """
+    if converter is None:
+        return None
+
+    temp_dir = Path(mkdtemp())
+    try:
+        temp_docx = temp_dir / "drawing_only.docx"
+        temp_pdf = temp_dir / "drawing_only.pdf"
+
+        # 1) Save docx temporarily
+        docx.save(str(temp_docx))
+
+        # 2) Export to PDF
+        converter(temp_docx, temp_pdf)
+
+        # 3) Load PDF as PNG
+        pdf = pypdfium2.PdfDocument(temp_pdf)
+        try:
+            page = pdf[0]
+            try:
+                return crop_whitespace(page.render(scale=2).to_pil())
+            finally:
+                page.close()
+        finally:
+            pdf.close()
+    finally:
+        # mkdtemp() leaves cleanup to the caller, so remove the directory
+        # even when one of the steps above fails.
+        shutil.rmtree(temp_dir, ignore_errors=True)
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -1,5 +1,6 @@
 import logging
 import re
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import Any, List, Optional, Union
@@ -30,6 +31,10 @@ from pydantic import AnyUrl
 from typing_extensions import override

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.docx.drawingml.utils import (
+    get_docx_to_pdf_converter,
+    get_pil_from_dml_docx,
+)
 from docling.backend.docx.latex.omml import oMath2Latex
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -61,6 +66,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.equation_bookends: str = "<eq>{EQ}</eq>"
        # Track processed textbox elements to avoid duplication
        self.processed_textbox_elements: List[int] = []
+        self.docx_to_pdf_converter = get_docx_to_pdf_converter()

        for i in range(-1, self.max_levels):
            self.parents[i] = None
@@ -75,18 +81,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            "indents": [None],
        }

-        self.docx_obj = None
-        try:
-            if isinstance(self.path_or_stream, BytesIO):
-                self.docx_obj = Document(self.path_or_stream)
-            elif isinstance(self.path_or_stream, Path):
-                self.docx_obj = Document(str(self.path_or_stream))
-
+        self.docx_obj = self.load_msword_file(
+            path_or_stream=self.path_or_stream, document_hash=self.document_hash
+        )
+        if self.docx_obj:
            self.valid = True
-        except Exception as e:
-            raise RuntimeError(
-                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
-            ) from e

    @override
    def is_valid(self) -> bool:
@@ -133,6 +132,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

+    @staticmethod
+    def load_msword_file(
+        path_or_stream: Union[BytesIO, Path], document_hash: str
+    ) -> Optional[DocxDocument]:
+        """
+        Open a Word document from an in-memory stream or a filesystem path.
+
+        Returns the loaded document, or None when ``path_or_stream`` is
+        neither a ``BytesIO`` nor a ``Path``.
+
+        Raises RuntimeError (chained to the original exception) if the
+        document cannot be loaded; ``document_hash`` is only used in that
+        error message.
+        """
+        try:
+            if isinstance(path_or_stream, BytesIO):
+                return Document(path_or_stream)
+            if isinstance(path_or_stream, Path):
+                return Document(str(path_or_stream))
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {document_hash}"
+            ) from e
+
+        # Unsupported input type: nothing to load.
+        return None
+
    def _update_history(
        self,
        name: str,
@@ -187,6 +202,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            }
            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
            drawing_blip = xpath_expr(element)
+            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)

            # Check for textbox content - check multiple textbox formats
            # Only process if the element hasn't been processed before
@@ -261,6 +277,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    and element.find(".//w:t", namespaces=namespaces) is not None
                ):
                    self._handle_text_elements(element, docx_obj, doc)
+            # Check for DrawingML elements
+            elif drawingml_els:
+                if self.docx_to_pdf_converter is None:
+                    _log.warning(
+                        "Found DrawingML elements in document, but no DOCX to PDF converters. "
+                        "If you want these exported, make sure you have "
+                        "LibreOffice (make sure its binary is in PATH) [Preferred], "
+                        "Word+docx2pdf, "
+                        "or pypandoc installed."
+                    )
+                else:
+                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
            # Check for the sdt containers, like table of contents
            elif tag_name in ["sdt"]:
                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -1170,3 +1198,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    caption=None,
                )
        return
+
+    def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
+        """
+        Render DrawingML elements to an image and add it to ``doc`` as a picture.
+
+        A fresh copy of the source document is loaded and its body emptied,
+        then deep copies of ``drawingml_els`` are placed in a single new run.
+        That document is rendered through the configured DOCX to PDF
+        converter. If no image is produced, or an ``UnidentifiedImageError``
+        or ``OSError`` is raised, a warning is logged and a picture without
+        image data is added instead.
+        """
+        # 1) Make an empty copy of the original document
+        scratch_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
+        scratch_body = scratch_doc._element.body
+        for node in list(scratch_body):
+            scratch_body.remove(node)
+
+        # 2) Add DrawingML to empty document
+        host_run = scratch_doc.add_paragraph().add_run()
+        for drawing in drawingml_els:
+            # Copy so the element stays attached to the original document.
+            host_run._r.append(deepcopy(drawing))
+
+        # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
+        level = self._get_level()
+        try:
+            rendered = get_pil_from_dml_docx(
+                scratch_doc, converter=self.docx_to_pdf_converter
+            )
+            if rendered is None:
+                # Route "no image" through the same fallback as a load failure.
+                raise UnidentifiedImageError
+
+            image_ref = ImageRef.from_pil(image=rendered, dpi=72)
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                image=image_ref,
+                caption=None,
+            )
+        except (UnidentifiedImageError, OSError):
+            _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                caption=None,
+            )