feat(docx): Process drawingml objects in docx (#2453)

* Export of DrawingML figures into docling document * Adding libreoffice env var and libreoffice to checks image Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * DCO Remediation Commit for Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> I, Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>, hereby add my Signed-off-by to this commit: 9518fffcad Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * Enforcing apt get update Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * Only display drawingml warning once per document Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * add util to test libreoffice and exclude files from test when not found Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * check libreoffice only once Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Only initialise converter if needed Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> --------- Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-10-15 10:58:08 +02:00
parent 3e6da2c62d
commit 16829939cf
8 changed files with 512 additions and 25 deletions
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -80,10 +80,8 @@ jobs:
        - name: Install System Dependencies
          run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
+            sudo apt-get -qq update
-              sudo apt-get -qq update
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
            fi
            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -149,10 +147,8 @@ jobs:
        - name: Install System Dependencies
          run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
+            sudo apt-get -qq update
-              sudo apt-get -qq update
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
            fi
            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -223,10 +219,8 @@ jobs:
        - name: Install System Dependencies
          run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
+            sudo apt-get -qq update
-              sudo apt-get -qq update
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
            fi
            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
--- a/docling/backend/docx/drawingml/utils.py
+++ b/docling/backend/docx/drawingml/utils.py
@@ -0,0 +1,131 @@
 import os
 import shutil
 import subprocess
 from pathlib import Path
 from tempfile import mkdtemp
 from typing import Callable, Optional
 import pypdfium2
 from docx.document import Document
 from PIL import Image, ImageChops
 def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Locate the LibreOffice executable and optionally verify that it runs.

    Candidates are tried in this order and the first hit wins:

    1. the command name or path in the ``DOCLING_LIBREOFFICE_CMD`` environment
       variable, when it resolves to an executable,
    2. ``libreoffice`` on ``PATH``,
    3. ``soffice`` on ``PATH``,
    4. the default macOS application bundle location.

    Args:
        raise_if_unavailable: When true, raise instead of returning ``None``
            if no candidate is found, and smoke-test the command with ``-h``.

    Returns:
        The resolved command, or ``None`` when nothing was found and
        ``raise_if_unavailable`` is false.

    Raises:
        RuntimeError: No candidate was found and ``raise_if_unavailable`` is set.
        subprocess.CalledProcessError: The smoke test exited with a non-zero
            status.
        OSError: The smoke test process could not be started.
    """
    macos_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    # An explicit override takes precedence over auto-detection. shutil.which
    # resolves both bare command names and full paths, and returns None for
    # values that do not point at an executable, in which case we fall through
    # to the auto-detected candidates.
    env_cmd = os.environ.get("DOCLING_LIBREOFFICE_CMD")

    libreoffice_cmd = (
        (shutil.which(env_cmd) if env_cmd else None)
        or shutil.which("libreoffice")
        or shutil.which("soffice")
        or (macos_cmd if os.path.isfile(macos_cmd) else None)
    )

    if raise_if_unavailable:
        if libreoffice_cmd is None:
            raise RuntimeError("Libreoffice not found")

        # The following test will raise if the libreoffice_cmd cannot be used
        subprocess.run(
            [
                libreoffice_cmd,
                "-h",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    return libreoffice_cmd
 def get_docx_to_pdf_converter() -> Optional[Callable]:
    """Return a DOCX-to-PDF conversion function for the best available tool.

    The returned callable has the signature ``(input_path, output_path)`` and
    accepts ``str`` or ``os.PathLike`` arguments. Only LibreOffice is
    supported at the moment.

    Returns:
        The conversion function, or ``None`` if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            """Convert ``input_path`` to PDF and place the result at ``output_path``.

            Raises:
                subprocess.CalledProcessError: LibreOffice exited with a
                    non-zero status.
                OSError: LibreOffice could not be started, or the converted
                    file could not be moved to ``output_path``.
            """
            # Callers may hand in pathlib.Path objects; normalise to str so
            # the comparison against the os.path-built name below is
            # meaningful (a str never compares equal to a Path).
            input_path = os.fspath(input_path)
            output_path = os.fspath(output_path)
            output_dir = os.path.dirname(output_path)

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    output_dir,
                    input_path,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # LibreOffice names the output after the input file's stem; move
            # it only when the caller asked for a different file name.
            expected_output = os.path.join(
                output_dir,
                os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
            )
            if expected_output != output_path:
                # os.replace overwrites an existing target on every platform,
                # whereas os.rename fails on Windows when the target exists.
                os.replace(expected_output, output_path)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
 def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop ``image`` to the region that differs from its background colour.

    Args:
        image: Image to crop.
        bg_color: Colour treated as background. When ``None``, the colour of
            the top-left pixel is used.
        padding: Number of pixels kept around the detected content, clamped
            to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no bounding box of
        differing pixels is found.
    """
    background_color = image.getpixel((0, 0)) if bg_color is None else bg_color
    background = Image.new(image.mode, image.size, background_color)

    # Bounding box of every pixel that is not identical to the background.
    content_box = ImageChops.difference(image, background).getbbox()
    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    padded_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(padded_box)
 def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
 ) -> Optional[Image.Image]:
    """Render the first page of ``docx`` to a whitespace-cropped PIL image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter`` and the first PDF page is rasterised at scale 2. The
    temporary directory and the PDF handles are released even when saving,
    converting or rendering fails.

    Args:
        docx: Document to render.
        converter: Callable taking ``(input_path, output_path)`` that writes
            a PDF version of the input to ``output_path``.

    Returns:
        The cropped page image, or ``None`` when ``converter`` is ``None``.

    Raises:
        Whatever ``docx.save``, ``converter`` or the PDF rendering raise; the
        exceptions are propagated unchanged after cleanup.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = Path(temp_dir / "drawing_only.docx")
        temp_pdf = Path(temp_dir / "drawing_only.pdf")

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                image = crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()

        return image
    finally:
        # Runs on success and on failure so the temp files never accumulate.
        shutil.rmtree(temp_dir, ignore_errors=True)
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -1,8 +1,9 @@
 import logging
 import re
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union
 from docling_core.types.doc import (
    DocItemLabel,
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
 from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.docx.drawingml.utils import (
    get_docx_to_pdf_converter,
    get_libreoffice_cmd,
    get_pil_from_dml_docx,
 )
 from docling.backend.docx.latex.omml import oMath2Latex
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -64,6 +70,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.equation_bookends: str = "<eq>{EQ}</eq>"
        # Track processed textbox elements to avoid duplication
        self.processed_textbox_elements: List[int] = []
        self.docx_to_pdf_converter: Optional[Callable] = None
        self.docx_to_pdf_converter_init = False
        self.display_drawingml_warning = True
        for i in range(-1, self.max_levels):
            self.parents[i] = None
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            "indents": [None],
        }
-        self.docx_obj = None
+        self.docx_obj = self.load_msword_file(
-        try:
+            path_or_stream=self.path_or_stream, document_hash=self.document_hash
-            if isinstance(self.path_or_stream, BytesIO):
+        )
-                self.docx_obj = Document(self.path_or_stream)
+        if self.docx_obj:
            elif isinstance(self.path_or_stream, Path):
                self.docx_obj = Document(str(self.path_or_stream))
            self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
            ) from e
    @override
    def is_valid(self) -> bool:
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )
    @staticmethod
    def load_msword_file(
        path_or_stream: Union[BytesIO, Path], document_hash: str
    ) -> DocxDocument:
        """Open a Word document from a file path or an in-memory stream.

        Args:
            path_or_stream: Source of the document, either a ``Path`` or a
                ``BytesIO`` stream.
            document_hash: Hash of the document, used only in the error
                message.

        Returns:
            The loaded document. NOTE(review): ``None`` is returned when
            ``path_or_stream`` is neither a ``BytesIO`` nor a ``Path``, which
            the return annotation does not reflect.

        Raises:
            RuntimeError: Opening the document failed; the original exception
                is attached as the cause.
        """
        try:
            if isinstance(path_or_stream, Path):
                return Document(str(path_or_stream))
            if isinstance(path_or_stream, BytesIO):
                return Document(path_or_stream)
            # Unsupported source type.
            return None
        except Exception as err:
            raise RuntimeError(
                f"MsWordDocumentBackend could not load document with hash {document_hash}"
            ) from err
    def _update_history(
        self,
        name: str,
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            }
            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
            drawing_blip = xpath_expr(element)
            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
            # Check for textbox content - check multiple textbox formats
            # Only process if the element hasn't been processed before
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                ):
                    te1 = self._handle_text_elements(element, docx_obj, doc)
                    added_elements.extend(te1)
            # Check for DrawingML elements
            elif drawingml_els:
                if (
                    self.docx_to_pdf_converter is None
                    and self.docx_to_pdf_converter_init is False
                ):
                    self.docx_to_pdf_converter = get_docx_to_pdf_converter()
                    self.docx_to_pdf_converter_init = True
                if self.docx_to_pdf_converter is None:
                    if self.display_drawingml_warning:
                        if self.docx_to_pdf_converter is None:
                            _log.warning(
                                "Found DrawingML elements in document, but no DOCX to PDF converters. "
                                "If you want these exported, make sure you have "
                                "LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
                            )
                            self.display_drawingml_warning = False
                else:
                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
            # Check for the sdt containers, like table of contents
            elif tag_name in ["sdt"]:
                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                )
                elem_ref.append(p3.get_ref())
        return elem_ref
    def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any) -> None:
        """Render DrawingML elements to an image and add it to ``doc`` as a picture.

        A fresh copy of the source document is emptied and filled with copies
        of the given drawing elements, so that only the drawings are rendered
        by the DOCX -> PDF -> image pipeline. When rendering fails, a picture
        item without image data is added instead so the document structure is
        preserved.

        Args:
            doc: Document receiving the picture item.
            drawingml_els: Iterable of DrawingML XML elements to render
                together.
        """
        # Local import: only needed to recognise converter failures below.
        import subprocess

        # 1) Make an empty copy of the original document
        dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
        body = dml_doc._element.body
        for child in list(body):
            body.remove(child)

        # 2) Add DrawingML to empty document
        new_para = dml_doc.add_paragraph()
        new_r = new_para.add_run()
        for dml in drawingml_els:
            new_r._r.append(deepcopy(dml))

        # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
        level = self._get_level()
        try:
            pil_image = get_pil_from_dml_docx(
                dml_doc, converter=self.docx_to_pdf_converter
            )
            if pil_image is None:
                raise UnidentifiedImageError

            doc.add_picture(
                parent=self.parents[level - 1],
                image=ImageRef.from_pil(image=pil_image, dpi=72),
                caption=None,
            )
        # The converter runs its external tool with check=True, so a failed
        # conversion surfaces as a subprocess error; treat it like an
        # unreadable image rather than aborting the whole document.
        except (UnidentifiedImageError, OSError, subprocess.SubprocessError):
            _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
            doc.add_picture(
                parent=self.parents[level - 1],
                caption=None,
            )
--- a/tests/data/docx/drawingml.docx
+++ b/tests/data/docx/drawingml.docx
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.itxt
@@ -0,0 +1,13 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group textbox
    item-2 at level 2: text: Text 2
    item-3 at level 2: text: Text 1
  item-4 at level 1: picture
  item-5 at level 1: text: 
  item-6 at level 1: text: 
  item-7 at level 1: text: 
  item-8 at level 1: text: 
  item-9 at level 1: text: 
  item-10 at level 1: text: 
  item-11 at level 1: text: 
  item-12 at level 1: picture
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.json
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.json
--- a/tests/data/groundtruth/docling_v2/drawingml.docx.md
+++ b/tests/data/groundtruth/docling_v2/drawingml.docx.md
@@ -0,0 +1,7 @@
 Text 2
 Text 1
 <!-- image -->
 <!-- image -->
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -1,7 +1,9 @@
 import os
 from pathlib import Path
 import pytest
 from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import (
@@ -17,6 +19,7 @@ from .test_data_gen_flag import GEN_TEST_DATA
 from .verify_utils import verify_document, verify_export
 GENERATE = GEN_TEST_DATA
 IS_CI = bool(os.getenv("CI"))
@pytest.mark.xfail(strict=False)
@@ -84,8 +87,22 @@ def get_converter():
 def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
    converter = get_converter()
    has_libreoffice = False
    try:
        cmd = get_libreoffice_cmd(raise_if_unavailable=True)
        if cmd is not None:
            has_libreoffice = True
    except Exception:
        pass
    for docx_path in docx_paths:
-        # print(f"converting {docx_path}")
+        if (
            not IS_CI
            and not has_libreoffice
            and str(docx_path) in ("tests/data/docx/drawingml.docx",)
        ):
            print(f"Skipping {docx_path} because no Libreoffice is installed.")
            continue
        gt_path = (
            docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name