mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat(docx): Process drawingml objects in docx (#2453)
* Export of DrawingML figures into docling document
* Adding libreoffice env var and libreoffice to checks image
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* DCO Remediation Commit for Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
I, Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>, hereby add my Signed-off-by to this commit: 9518fffcad
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* Enforcing apt get update
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* Only display drawingml warning once per document
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* add util to test libreoffice and exclude files from test when not found
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* check libreoffice only once
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* Only initialise converter if needed
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
---------
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
3e6da2c62d
commit
16829939cf
12
.github/workflows/checks.yml
vendored
12
.github/workflows/checks.yml
vendored
@@ -80,10 +80,8 @@ jobs:
|
||||
|
||||
- name: Install System Dependencies
|
||||
run: |
|
||||
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
|
||||
sudo apt-get -qq update
|
||||
fi
|
||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
|
||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
|
||||
|
||||
- name: Set TESSDATA_PREFIX
|
||||
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
||||
@@ -149,10 +147,8 @@ jobs:
|
||||
|
||||
- name: Install System Dependencies
|
||||
run: |
|
||||
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
|
||||
sudo apt-get -qq update
|
||||
fi
|
||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
|
||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
|
||||
|
||||
- name: Set TESSDATA_PREFIX
|
||||
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
||||
@@ -223,10 +219,8 @@ jobs:
|
||||
|
||||
- name: Install System Dependencies
|
||||
run: |
|
||||
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
|
||||
sudo apt-get -qq update
|
||||
fi
|
||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
|
||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
|
||||
|
||||
- name: Set TESSDATA_PREFIX
|
||||
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
||||
|
||||
131
docling/backend/docx/drawingml/utils.py
Normal file
131
docling/backend/docx/drawingml/utils.py
Normal file
@@ -0,0 +1,131 @@
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from tempfile import mkdtemp
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pypdfium2
|
||||
from docx.document import Document
|
||||
from PIL import Image, ImageChops
|
||||
|
||||
|
||||
def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Return the LibreOffice command and optionally test it.

    Lookup order:

    1. The ``DOCLING_LIBREOFFICE_CMD`` environment variable, if it is set and
       resolves to an executable (either a name on ``PATH`` or a path).
    2. ``libreoffice`` on ``PATH``.
    3. ``soffice`` on ``PATH``.
    4. The default macOS application bundle location.

    Args:
        raise_if_unavailable: When true, raise instead of returning ``None``
            and additionally run ``<cmd> -h`` to verify the command works.

    Returns:
        The command to invoke, or ``None`` when nothing was found and
        ``raise_if_unavailable`` is false.

    Raises:
        RuntimeError: No command was found and ``raise_if_unavailable`` is true.
        subprocess.CalledProcessError: The ``-h`` probe exited with a non-zero
            status (only when ``raise_if_unavailable`` is true).
        OSError: The probe could not be started (only when
            ``raise_if_unavailable`` is true).
    """
    macos_bundle_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    # An explicit override takes precedence over auto-detection. If it does not
    # resolve to an executable we fall back to auto-detection below.
    override = os.environ.get("DOCLING_LIBREOFFICE_CMD")
    libreoffice_cmd = shutil.which(override) if override else None

    if libreoffice_cmd is None:
        libreoffice_cmd = (
            shutil.which("libreoffice")
            or shutil.which("soffice")
            or (macos_bundle_cmd if os.path.isfile(macos_bundle_cmd) else None)
        )

    if raise_if_unavailable:
        if libreoffice_cmd is None:
            raise RuntimeError("Libreoffice not found")

        # The following test will raise if the libreoffice_cmd cannot be used
        subprocess.run(
            [
                libreoffice_cmd,
                "-h",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    return libreoffice_cmd
|
||||
|
||||
|
||||
def get_docx_to_pdf_converter() -> Optional[Callable]:
    """
    Detects the best available DOCX to PDF tool and returns a conversion function.
    The returned function accepts (input_path, output_path); both may be
    ``str`` or ``os.PathLike``.
    Returns None if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            """Convert ``input_path`` to PDF and place the result at ``output_path``.

            Raises ``subprocess.CalledProcessError`` if LibreOffice exits with
            a non-zero status, and ``OSError`` if the produced file cannot be
            moved to ``output_path``.
            """
            # A bare file name has an empty dirname; "--outdir" needs a real
            # directory, so use the current one.
            out_dir = os.path.dirname(output_path) or "."

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    out_dir,
                    os.fspath(input_path),
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # The output is expected in the output directory, named after the
            # input file's stem.
            expected_output = os.path.join(
                out_dir,
                os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
            )
            # Compare as Path objects: a str never equals a Path, so a plain
            # "!=" would always be true when callers pass Path arguments.
            if Path(expected_output) != Path(output_path):
                # os.replace overwrites an existing destination on every platform.
                os.replace(expected_output, output_path)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
|
||||
|
||||
|
||||
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop away the uniform background surrounding the image content.

    Args:
        image: The image to trim.
        bg_color: Colour treated as background. When ``None`` the colour of
            the top-left pixel is used.
        padding: Number of pixels to keep around the detected content,
            clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no pixel differs from
        the background.
    """
    # Fall back to the top-left pixel as the background reference.
    reference_color = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Bounding box of everything that differs from a solid background canvas.
    backdrop = Image.new(image.mode, image.size, reference_color)
    content_box = ImageChops.difference(image, backdrop).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    crop_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(crop_box)
|
||||
|
||||
|
||||
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render the first page of ``docx`` as a whitespace-cropped PIL image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter`` and the first PDF page is rendered at ``scale=2``.

    Args:
        docx: Document to render.
        converter: Callable taking ``(input_path, output_path)`` that writes
            a PDF; ``None`` disables rendering.

    Returns:
        The cropped page image, or ``None`` when ``converter`` is ``None``.

    Any exception raised while saving, converting or rendering propagates to
    the caller; the temporary directory and PDF handles are released first.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                return crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Always remove the scratch directory, even when a step above failed.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
@@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Union
|
||||
from typing import Any, Callable, List, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.docx.drawingml.utils import (
|
||||
get_docx_to_pdf_converter,
|
||||
get_libreoffice_cmd,
|
||||
get_pil_from_dml_docx,
|
||||
)
|
||||
from docling.backend.docx.latex.omml import oMath2Latex
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
@@ -64,6 +70,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||
# Track processed textbox elements to avoid duplication
|
||||
self.processed_textbox_elements: List[int] = []
|
||||
self.docx_to_pdf_converter: Optional[Callable] = None
|
||||
self.docx_to_pdf_converter_init = False
|
||||
self.display_drawingml_warning = True
|
||||
|
||||
for i in range(-1, self.max_levels):
|
||||
self.parents[i] = None
|
||||
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
"indents": [None],
|
||||
}
|
||||
|
||||
self.docx_obj = None
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.docx_obj = Document(self.path_or_stream)
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
self.docx_obj = Document(str(self.path_or_stream))
|
||||
|
||||
self.docx_obj = self.load_msword_file(
|
||||
path_or_stream=self.path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
if self.docx_obj:
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
@override
|
||||
def is_valid(self) -> bool:
|
||||
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||
)
|
||||
|
||||
@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> Optional[DocxDocument]:
    """Open a Word document from a byte stream or a filesystem path.

    Args:
        path_or_stream: In-memory stream or path of the document.
        document_hash: Identifier used only in the error message.

    Returns:
        The loaded document, or ``None`` when ``path_or_stream`` is neither
        a ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: The document could not be loaded; the original
            exception is attached as the cause.
    """
    try:
        if isinstance(path_or_stream, BytesIO):
            return Document(path_or_stream)
        if isinstance(path_or_stream, Path):
            return Document(str(path_or_stream))
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from e

    # Unsupported input type: nothing to load.
    return None
|
||||
|
||||
def _update_history(
|
||||
self,
|
||||
name: str,
|
||||
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
}
|
||||
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
||||
drawing_blip = xpath_expr(element)
|
||||
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
|
||||
|
||||
# Check for textbox content - check multiple textbox formats
|
||||
# Only process if the element hasn't been processed before
|
||||
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
):
|
||||
te1 = self._handle_text_elements(element, docx_obj, doc)
|
||||
added_elements.extend(te1)
|
||||
# Check for DrawingML elements
|
||||
elif drawingml_els:
|
||||
if (
|
||||
self.docx_to_pdf_converter is None
|
||||
and self.docx_to_pdf_converter_init is False
|
||||
):
|
||||
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
|
||||
self.docx_to_pdf_converter_init = True
|
||||
|
||||
if self.docx_to_pdf_converter is None:
|
||||
if self.display_drawingml_warning:
|
||||
if self.docx_to_pdf_converter is None:
|
||||
_log.warning(
|
||||
"Found DrawingML elements in document, but no DOCX to PDF converters. "
|
||||
"If you want these exported, make sure you have "
|
||||
"LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
|
||||
)
|
||||
self.display_drawingml_warning = False
|
||||
else:
|
||||
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
|
||||
# Check for the sdt containers, like table of contents
|
||||
elif tag_name in ["sdt"]:
|
||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
elem_ref.append(p3.get_ref())
|
||||
return elem_ref
|
||||
|
||||
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
    """Render DrawingML elements to an image and add it to ``doc`` as a picture.

    A fresh copy of the source document is emptied and filled with copies of
    the given DrawingML elements, then rendered through
    ``self.docx_to_pdf_converter``. If rendering fails, a picture item
    without image data is added instead so the document structure is kept.

    Args:
        doc: Document receiving the picture item.
        drawingml_els: Iterable of DrawingML XML elements to render together.
    """
    # Local import: only needed to recognise converter failures.
    import subprocess

    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    body = dml_doc._element.body
    for child in list(body):
        body.remove(child)

    # 2) Add DrawingML to empty document
    new_para = dml_doc.add_paragraph()
    new_r = new_para.add_run()
    for dml in drawingml_els:
        # Deep copy so the elements stay attached to the original tree too.
        new_r._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    try:
        pil_image = get_pil_from_dml_docx(
            dml_doc, converter=self.docx_to_pdf_converter
        )
        if pil_image is None:
            raise UnidentifiedImageError

        doc.add_picture(
            parent=self.parents[level - 1],
            image=ImageRef.from_pil(image=pil_image, dpi=72),
            caption=None,
        )
        return
    except (UnidentifiedImageError, OSError):
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
    except subprocess.SubprocessError as exc:
        # A failing converter process (e.g. non-zero exit) is not an OSError;
        # treat it like any other rendering failure instead of aborting the
        # whole document conversion.
        _log.warning(
            "Warning: DrawingML could not be converted by the DOCX to PDF converter: %s",
            exc,
        )

    # Rendering failed: keep a placeholder picture without image data.
    doc.add_picture(
        parent=self.parents[level - 1],
        caption=None,
    )
    return
|
||||
|
||||
BIN
tests/data/docx/drawingml.docx
vendored
Normal file
BIN
tests/data/docx/drawingml.docx
vendored
Normal file
Binary file not shown.
13
tests/data/groundtruth/docling_v2/drawingml.docx.itxt
vendored
Normal file
13
tests/data/groundtruth/docling_v2/drawingml.docx.itxt
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group textbox
|
||||
item-2 at level 2: text: Text 2
|
||||
item-3 at level 2: text: Text 1
|
||||
item-4 at level 1: picture
|
||||
item-5 at level 1: text:
|
||||
item-6 at level 1: text:
|
||||
item-7 at level 1: text:
|
||||
item-8 at level 1: text:
|
||||
item-9 at level 1: text:
|
||||
item-10 at level 1: text:
|
||||
item-11 at level 1: text:
|
||||
item-12 at level 1: picture
|
||||
250
tests/data/groundtruth/docling_v2/drawingml.docx.json
vendored
Normal file
250
tests/data/groundtruth/docling_v2/drawingml.docx.json
vendored
Normal file
File diff suppressed because one or more lines are too long
7
tests/data/groundtruth/docling_v2/drawingml.docx.md
vendored
Normal file
7
tests/data/groundtruth/docling_v2/drawingml.docx.md
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
Text 2
|
||||
|
||||
Text 1
|
||||
|
||||
<!-- image -->
|
||||
|
||||
<!-- image -->
|
||||
@@ -1,7 +1,9 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
@@ -17,6 +19,7 @@ from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
GENERATE = GEN_TEST_DATA
|
||||
IS_CI = bool(os.getenv("CI"))
|
||||
|
||||
|
||||
@pytest.mark.xfail(strict=False)
|
||||
@@ -84,8 +87,22 @@ def get_converter():
|
||||
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||
converter = get_converter()
|
||||
|
||||
has_libreoffice = False
|
||||
try:
|
||||
cmd = get_libreoffice_cmd(raise_if_unavailable=True)
|
||||
if cmd is not None:
|
||||
has_libreoffice = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for docx_path in docx_paths:
|
||||
# print(f"converting {docx_path}")
|
||||
if (
|
||||
not IS_CI
|
||||
and not has_libreoffice
|
||||
and str(docx_path) in ("tests/data/docx/drawingml.docx",)
|
||||
):
|
||||
print(f"Skipping {docx_path} because no Libreoffice is installed.")
|
||||
continue
|
||||
|
||||
gt_path = (
|
||||
docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
|
||||
|
||||
Reference in New Issue
Block a user