mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat(docx): Process drawingml objects in docx (#2453)
* Export of DrawingML figures into docling document
* Adding LibreOffice env var and LibreOffice to the checks image
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* DCO Remediation Commit for Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
I, Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>, hereby add my Signed-off-by to this commit: 9518fffcad
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* Enforcing apt-get update
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* Only display drawingml warning once per document
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* Add util to check for LibreOffice and exclude files from tests when it is not found
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* check libreoffice only once
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* Only initialise converter if needed
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
---------
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
3e6da2c62d
commit
16829939cf
BIN
tests/data/docx/drawingml.docx
vendored
Normal file
BIN
tests/data/docx/drawingml.docx
vendored
Normal file
Binary file not shown.
13
tests/data/groundtruth/docling_v2/drawingml.docx.itxt
vendored
Normal file
13
tests/data/groundtruth/docling_v2/drawingml.docx.itxt
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group textbox
|
||||
item-2 at level 2: text: Text 2
|
||||
item-3 at level 2: text: Text 1
|
||||
item-4 at level 1: picture
|
||||
item-5 at level 1: text:
|
||||
item-6 at level 1: text:
|
||||
item-7 at level 1: text:
|
||||
item-8 at level 1: text:
|
||||
item-9 at level 1: text:
|
||||
item-10 at level 1: text:
|
||||
item-11 at level 1: text:
|
||||
item-12 at level 1: picture
|
||||
250
tests/data/groundtruth/docling_v2/drawingml.docx.json
vendored
Normal file
250
tests/data/groundtruth/docling_v2/drawingml.docx.json
vendored
Normal file
File diff suppressed because one or more lines are too long
7
tests/data/groundtruth/docling_v2/drawingml.docx.md
vendored
Normal file
7
tests/data/groundtruth/docling_v2/drawingml.docx.md
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
Text 2
|
||||
|
||||
Text 1
|
||||
|
||||
<!-- image -->
|
||||
|
||||
<!-- image -->
|
||||
@@ -1,7 +1,9 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
@@ -17,6 +19,7 @@ from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
GENERATE = GEN_TEST_DATA
|
||||
IS_CI = bool(os.getenv("CI"))
|
||||
|
||||
|
||||
@pytest.mark.xfail(strict=False)
|
||||
@@ -84,8 +87,22 @@ def get_converter():
|
||||
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||
converter = get_converter()
|
||||
|
||||
has_libreoffice = False
|
||||
try:
|
||||
cmd = get_libreoffice_cmd(raise_if_unavailable=True)
|
||||
if cmd is not None:
|
||||
has_libreoffice = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for docx_path in docx_paths:
|
||||
# print(f"converting {docx_path}")
|
||||
if (
|
||||
not IS_CI
|
||||
and not has_libreoffice
|
||||
and str(docx_path) in ("tests/data/docx/drawingml.docx",)
|
||||
):
|
||||
print(f"Skipping {docx_path} because no Libreoffice is installed.")
|
||||
continue
|
||||
|
||||
gt_path = (
|
||||
docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
|
||||
|
||||
Reference in New Issue
Block a user