feat(docx): Process drawingml objects in docx (#2453)

* Export of DrawingML figures into docling document

* Adding libreoffice env var and libreoffice to checks image

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* DCO Remediation Commit for Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

I, Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>, hereby add my Signed-off-by to this commit: 9518fffcad

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Enforcing apt get update

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Only display drawingml warning once per document

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* add util to test libreoffice and exclude files from test when not found

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* check libreoffice only once

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Only initialise converter if needed

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

---------

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Rafael Teixeira de Lima
2025-10-15 10:58:08 +02:00
committed by GitHub
parent 3e6da2c62d
commit 16829939cf
8 changed files with 512 additions and 25 deletions

View File

@@ -80,10 +80,8 @@ jobs:
- name: Install System Dependencies - name: Install System Dependencies
run: | run: |
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
sudo apt-get -qq update sudo apt-get -qq update
fi sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
- name: Set TESSDATA_PREFIX - name: Set TESSDATA_PREFIX
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -149,10 +147,8 @@ jobs:
- name: Install System Dependencies - name: Install System Dependencies
run: | run: |
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
sudo apt-get -qq update sudo apt-get -qq update
fi sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
- name: Set TESSDATA_PREFIX - name: Set TESSDATA_PREFIX
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -223,10 +219,8 @@ jobs:
- name: Install System Dependencies - name: Install System Dependencies
run: | run: |
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
sudo apt-get -qq update sudo apt-get -qq update
fi sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
- name: Set TESSDATA_PREFIX - name: Set TESSDATA_PREFIX
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"

View File

@@ -0,0 +1,131 @@
import os
import shutil
import subprocess
from pathlib import Path
from tempfile import mkdtemp
from typing import Callable, Optional
import pypdfium2
from docx.document import Document
from PIL import Image, ImageChops
def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Return the LibreOffice command and optionally test it.

    Candidates are tried in this order and the first hit wins:

    1. the value of the ``DOCLING_LIBREOFFICE_CMD`` environment variable,
       resolved with ``shutil.which`` (so it may be an executable name on
       PATH or a path to an executable);
    2. ``libreoffice`` on PATH;
    3. ``soffice`` on PATH;
    4. the default macOS application bundle location, if that file exists.

    If ``DOCLING_LIBREOFFICE_CMD`` is set but does not resolve to an
    executable, it is ignored and the automatic lookup is used instead.

    Args:
        raise_if_unavailable: When True, raise instead of returning None if no
            command is found, and additionally run ``<cmd> -h`` to verify that
            the command can actually be executed.

    Returns:
        The command to invoke, or None if nothing was found (only possible
        when ``raise_if_unavailable`` is False).

    Raises:
        RuntimeError: No command was found and ``raise_if_unavailable`` is True.
        subprocess.CalledProcessError: The ``-h`` probe exited with a non-zero
            status (only when ``raise_if_unavailable`` is True).
        OSError: The probe could not be started (only when
            ``raise_if_unavailable`` is True).
    """
    macos_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    # The user-facing warning in the DOCX backend advertises this variable,
    # so an explicit override takes precedence over auto-detection.
    env_cmd = os.environ.get("DOCLING_LIBREOFFICE_CMD")

    libreoffice_cmd = (
        (shutil.which(env_cmd) if env_cmd else None)
        or shutil.which("libreoffice")
        or shutil.which("soffice")
        or (macos_cmd if os.path.isfile(macos_cmd) else None)
    )

    if raise_if_unavailable:
        if libreoffice_cmd is None:
            raise RuntimeError("Libreoffice not found")

        # The following test will raise if the libreoffice_cmd cannot be used
        subprocess.run(
            [
                libreoffice_cmd,
                "-h",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    return libreoffice_cmd
def get_docx_to_pdf_converter() -> Optional[Callable]:
    """
    Detects the best available DOCX to PDF tool and returns a conversion function.
    The returned function accepts (input_path, output_path); both may be
    strings or path-like objects.
    Returns None if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            """Convert ``input_path`` to PDF and place the result at ``output_path``.

            Raises subprocess.CalledProcessError if LibreOffice exits with a
            non-zero status.
            """
            # Normalise to plain strings so that the path comparison below is
            # meaningful even when the caller passes pathlib.Path objects
            # (a str never compares equal to a Path).
            input_path = os.fspath(input_path)
            output_path = os.fspath(output_path)
            # An output path without a directory component means "current dir";
            # never hand an empty string to --outdir.
            out_dir = os.path.dirname(output_path) or os.curdir

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    out_dir,
                    input_path,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # LibreOffice names the output after the input file; move it only
            # if the caller asked for a different name.
            expected_output = os.path.join(
                out_dir,
                os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
            )
            if os.path.normpath(expected_output) != os.path.normpath(output_path):
                os.replace(expected_output, output_path)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop away the uniform background surrounding the content of an image.

    Args:
        image: The image to crop.
        bg_color: Colour treated as background. When None, the colour of the
            top-left pixel of ``image`` is used.
        padding: Number of pixels to keep around the detected content,
            clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no pixel differs from the
        background.
    """
    background_color = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Pixels equal to the background become zero in the difference image, so
    # its bounding box is the bounding box of the actual content.
    backdrop = Image.new(image.mode, image.size, background_color)
    content_box = ImageChops.difference(image, backdrop).getbbox()
    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    padded_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(padded_box)
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render the first page of a DOCX document to a whitespace-cropped image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter``, and the first PDF page is rendered at scale 2 and cropped
    with ``crop_whitespace``. The temporary directory and the PDF handles are
    released even when one of these steps raises.

    Args:
        docx: Document object exposing ``save(path)``.
        converter: Callable taking ``(input_path, output_path)`` that writes
            a PDF to ``output_path``; None when no converter is available.

    Returns:
        The rendered image, or None when ``converter`` is None.

    Raises:
        Whatever ``docx.save``, ``converter`` or the PDF rendering raise; the
        exception is propagated after cleanup.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                image = crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Always remove the scratch directory, also on failure, so repeated
        # conversion errors do not accumulate temporary files.
        shutil.rmtree(temp_dir, ignore_errors=True)

    return image

View File

@@ -1,8 +1,9 @@
import logging import logging
import re import re
from copy import deepcopy
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Any, List, Optional, Union from typing import Any, Callable, List, Optional, Union
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
from typing_extensions import override from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.docx.drawingml.utils import (
get_docx_to_pdf_converter,
get_libreoffice_cmd,
get_pil_from_dml_docx,
)
from docling.backend.docx.latex.omml import oMath2Latex from docling.backend.docx.latex.omml import oMath2Latex
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@@ -64,6 +70,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.equation_bookends: str = "<eq>{EQ}</eq>" self.equation_bookends: str = "<eq>{EQ}</eq>"
# Track processed textbox elements to avoid duplication # Track processed textbox elements to avoid duplication
self.processed_textbox_elements: List[int] = [] self.processed_textbox_elements: List[int] = []
self.docx_to_pdf_converter: Optional[Callable] = None
self.docx_to_pdf_converter_init = False
self.display_drawingml_warning = True
for i in range(-1, self.max_levels): for i in range(-1, self.max_levels):
self.parents[i] = None self.parents[i] = None
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"indents": [None], "indents": [None],
} }
self.docx_obj = None self.docx_obj = self.load_msword_file(
try: path_or_stream=self.path_or_stream, document_hash=self.document_hash
if isinstance(self.path_or_stream, BytesIO): )
self.docx_obj = Document(self.path_or_stream) if self.docx_obj:
elif isinstance(self.path_or_stream, Path):
self.docx_obj = Document(str(self.path_or_stream))
self.valid = True self.valid = True
except Exception as e:
raise RuntimeError(
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
) from e
@override @override
def is_valid(self) -> bool: def is_valid(self) -> bool:
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f"Cannot convert doc with {self.document_hash} because the backend failed to init." f"Cannot convert doc with {self.document_hash} because the backend failed to init."
) )
@staticmethod
def load_msword_file(
path_or_stream: Union[BytesIO, Path], document_hash: str
) -> DocxDocument:
try:
if isinstance(path_or_stream, BytesIO):
return Document(path_or_stream)
elif isinstance(path_or_stream, Path):
return Document(str(path_or_stream))
else:
return None
except Exception as e:
raise RuntimeError(
f"MsWordDocumentBackend could not load document with hash {document_hash}"
) from e
def _update_history( def _update_history(
self, self,
name: str, name: str,
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
} }
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces) xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
drawing_blip = xpath_expr(element) drawing_blip = xpath_expr(element)
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
# Check for textbox content - check multiple textbox formats # Check for textbox content - check multiple textbox formats
# Only process if the element hasn't been processed before # Only process if the element hasn't been processed before
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
): ):
te1 = self._handle_text_elements(element, docx_obj, doc) te1 = self._handle_text_elements(element, docx_obj, doc)
added_elements.extend(te1) added_elements.extend(te1)
# Check for DrawingML elements
elif drawingml_els:
if (
self.docx_to_pdf_converter is None
and self.docx_to_pdf_converter_init is False
):
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
self.docx_to_pdf_converter_init = True
if self.docx_to_pdf_converter is None:
if self.display_drawingml_warning:
if self.docx_to_pdf_converter is None:
_log.warning(
"Found DrawingML elements in document, but no DOCX to PDF converters. "
"If you want these exported, make sure you have "
"LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
)
self.display_drawingml_warning = False
else:
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
# Check for the sdt containers, like table of contents # Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]: elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
elem_ref.append(p3.get_ref()) elem_ref.append(p3.get_ref())
return elem_ref return elem_ref
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any) -> None:
    """Render DrawingML elements to an image and add it to ``doc`` as a picture.

    A copy of the source document is emptied and filled with only the given
    DrawingML elements, then exported DOCX->PDF->image through
    ``self.docx_to_pdf_converter``. If the export or image loading fails, a
    picture item without image data is added instead so the figure position
    is still recorded.

    Args:
        doc: Document receiving the picture item.
        drawingml_els: Iterable of DrawingML XML elements to render together.
    """
    # Local import: only needed here to recognise converter process failures.
    import subprocess

    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    body = dml_doc._element.body
    for child in list(body):
        body.remove(child)

    # 2) Add DrawingML to empty document
    new_para = dml_doc.add_paragraph()
    new_r = new_para.add_run()
    for dml in drawingml_els:
        # Deep-copy so the elements stay attached to the original document.
        new_r._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    parent = self.parents[level - 1]
    try:
        pil_image = get_pil_from_dml_docx(
            dml_doc, converter=self.docx_to_pdf_converter
        )
        if pil_image is None:
            raise UnidentifiedImageError
        doc.add_picture(
            parent=parent,
            image=ImageRef.from_pil(image=pil_image, dpi=72),
            caption=None,
        )
        return
    except (UnidentifiedImageError, OSError):
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
    except subprocess.SubprocessError as exc:
        # The converter runs an external process with check=True; a failing
        # conversion must not abort the conversion of the whole document.
        _log.warning(
            "DrawingML export failed during DOCX to PDF conversion: %s", exc
        )

    # Fallback: keep a picture placeholder without image data.
    doc.add_picture(
        parent=parent,
        caption=None,
    )
    return

BIN
tests/data/docx/drawingml.docx vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,13 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group textbox
item-2 at level 2: text: Text 2
item-3 at level 2: text: Text 1
item-4 at level 1: picture
item-5 at level 1: text:
item-6 at level 1: text:
item-7 at level 1: text:
item-8 at level 1: text:
item-9 at level 1: text:
item-10 at level 1: text:
item-11 at level 1: text:
item-12 at level 1: picture

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,7 @@
Text 2
Text 1
<!-- image -->
<!-- image -->

View File

@@ -1,7 +1,9 @@
import os
from pathlib import Path from pathlib import Path
import pytest import pytest
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ( from docling.datamodel.document import (
@@ -17,6 +19,7 @@ from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export from .verify_utils import verify_document, verify_export
GENERATE = GEN_TEST_DATA GENERATE = GEN_TEST_DATA
IS_CI = bool(os.getenv("CI"))
@pytest.mark.xfail(strict=False) @pytest.mark.xfail(strict=False)
@@ -84,8 +87,22 @@ def get_converter():
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
converter = get_converter() converter = get_converter()
has_libreoffice = False
try:
cmd = get_libreoffice_cmd(raise_if_unavailable=True)
if cmd is not None:
has_libreoffice = True
except Exception:
pass
for docx_path in docx_paths: for docx_path in docx_paths:
# print(f"converting {docx_path}") if (
not IS_CI
and not has_libreoffice
and str(docx_path) in ("tests/data/docx/drawingml.docx",)
):
print(f"Skipping {docx_path} because no Libreoffice is installed.")
continue
gt_path = ( gt_path = (
docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name