feat(docx): Process drawingml objects in docx (#2453)

* Export of DrawingML figures into docling document

* Adding LibreOffice env var and LibreOffice to checks image

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* DCO Remediation Commit for Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

I, Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>, hereby add my Signed-off-by to this commit: 9518fffcad

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Enforcing apt-get update

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Only display DrawingML warning once per document

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Add util to test for LibreOffice and exclude files from tests when it is not found

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Check for LibreOffice only once

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Only initialise converter if needed

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

---------

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Rafael Teixeira de Lima
2025-10-15 10:58:08 +02:00
committed by GitHub
parent 3e6da2c62d
commit 16829939cf
8 changed files with 512 additions and 25 deletions

View File

@@ -1,7 +1,9 @@
import os
from pathlib import Path
import pytest
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
@@ -17,6 +19,7 @@ from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
# Local alias for the GEN_TEST_DATA flag imported from .test_data_gen_flag.
# NOTE(review): presumably toggles regeneration of ground-truth files — confirm against its uses.
GENERATE = GEN_TEST_DATA
# True when the CI environment variable is set to a non-empty string; False when it is unset or empty.
IS_CI = bool(os.getenv("CI"))
@pytest.mark.xfail(strict=False)
@@ -84,8 +87,22 @@ def get_converter():
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
converter = get_converter()
has_libreoffice = False
try:
cmd = get_libreoffice_cmd(raise_if_unavailable=True)
if cmd is not None:
has_libreoffice = True
except Exception:
pass
for docx_path in docx_paths:
# print(f"converting {docx_path}")
if (
not IS_CI
and not has_libreoffice
and str(docx_path) in ("tests/data/docx/drawingml.docx",)
):
print(f"Skipping {docx_path} because no Libreoffice is installed.")
continue
gt_path = (
docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name