mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
Export of DrawingML figures into docling document
This commit is contained in:
99
docling/backend/docx/drawingml/utils.py
Normal file
99
docling/backend/docx/drawingml/utils.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from tempfile import mkdtemp
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pypdfium2
|
||||
from docx.document import Document
|
||||
from PIL import Image, ImageChops
|
||||
|
||||
|
||||
def get_docx_to_pdf_converter() -> Optional[Callable]:
    """
    Detects the best available DOCX to PDF tool and returns a conversion function.

    The returned function accepts (input_path, output_path); each may be a
    string or an ``os.PathLike`` object. It raises
    ``subprocess.CalledProcessError`` when the tool exits with a non-zero
    status, and ``OSError`` when the converted file cannot be moved into place.

    Returns None if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = shutil.which("libreoffice") or shutil.which("soffice")
    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            # Work on plain strings: callers may pass pathlib.Path objects, and
            # a str/Path comparison below would otherwise never be equal.
            input_path = os.fspath(input_path)
            output_path = os.fspath(output_path)
            out_dir = os.path.dirname(output_path) or os.curdir

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    out_dir,
                    input_path,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # The converted file is expected at <outdir>/<input stem>.pdf;
            # move it only when the caller asked for a different name.
            expected_output = os.path.join(
                out_dir,
                os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
            )
            if os.path.normpath(expected_output) != os.path.normpath(output_path):
                # os.replace overwrites an existing target on every platform,
                # whereas os.rename fails on Windows if the target exists.
                os.replace(expected_output, output_path)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
|
||||
|
||||
|
||||
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Trim the uniform background border around the content of an image.

    Args:
        image: The image to crop.
        bg_color: Colour treated as background. When None, the colour of the
            top-left pixel is used.
        padding: Number of pixels to keep around the detected content box,
            clamped to the image bounds.

    Returns:
        The cropped image, or the input image itself when no pixel differs
        from the background colour.
    """
    fill = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Pixels equal to the background become zero in the difference image, so
    # its bounding box encloses everything that is not background.
    background = Image.new(image.mode, image.size, fill)
    content_box = ImageChops.difference(image, background).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    return image.crop(
        (
            max(0, x0 - padding),
            max(0, y0 - padding),
            min(image.width, x1 + padding),
            min(image.height, y1 + padding),
        )
    )
|
||||
|
||||
|
||||
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render the first page of a DOCX document to a cropped PIL image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter``, and the first PDF page is rendered at scale 2 and cropped
    with ``crop_whitespace``.

    Args:
        docx: Document object to render; only its ``save`` method is used.
        converter: Callable taking (input_path, output_path) that writes a PDF,
            or None when no conversion tool is available.

    Returns:
        The cropped page image, or None if ``converter`` is None.

    Raises:
        Whatever ``docx.save``, ``converter`` or the PDF rendering raise; the
        temporary directory is removed in every case.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                image = crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Clean up even when saving, converting or rendering failed, so a
        # broken document does not leave temporary files behind.
        shutil.rmtree(temp_dir, ignore_errors=True)

    return image
|
||||
@@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Union
|
||||
@@ -33,6 +34,10 @@ from pydantic import AnyUrl
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.docx.drawingml.utils import (
|
||||
get_docx_to_pdf_converter,
|
||||
get_pil_from_dml_docx,
|
||||
)
|
||||
from docling.backend.docx.latex.omml import oMath2Latex
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
@@ -64,6 +69,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||
# Track processed textbox elements to avoid duplication
|
||||
self.processed_textbox_elements: List[int] = []
|
||||
# Get docx 2 pdf converter if available
|
||||
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
|
||||
|
||||
for i in range(-1, self.max_levels):
|
||||
self.parents[i] = None
|
||||
@@ -80,18 +87,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
"indents": [None],
|
||||
}
|
||||
|
||||
self.docx_obj = None
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.docx_obj = Document(self.path_or_stream)
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
self.docx_obj = Document(str(self.path_or_stream))
|
||||
|
||||
self.docx_obj = self.load_msword_file(
|
||||
path_or_stream=self.path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
if self.docx_obj:
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
@override
|
||||
def is_valid(self) -> bool:
|
||||
@@ -139,6 +139,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def load_msword_file(
|
||||
path_or_stream: Union[BytesIO, Path], document_hash: str
|
||||
) -> DocxDocument:
|
||||
try:
|
||||
if isinstance(path_or_stream, BytesIO):
|
||||
return Document(path_or_stream)
|
||||
elif isinstance(path_or_stream, Path):
|
||||
return Document(str(path_or_stream))
|
||||
else:
|
||||
return None
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsWordDocumentBackend could not load document with hash {document_hash}"
|
||||
) from e
|
||||
|
||||
def _update_history(
|
||||
self,
|
||||
name: str,
|
||||
@@ -195,6 +211,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
}
|
||||
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
||||
drawing_blip = xpath_expr(element)
|
||||
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
|
||||
|
||||
# Check for textbox content - check multiple textbox formats
|
||||
# Only process if the element hasn't been processed before
|
||||
@@ -274,6 +291,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
):
|
||||
te1 = self._handle_text_elements(element, docx_obj, doc)
|
||||
added_elements.extend(te1)
|
||||
# Check for DrawingML elements
|
||||
elif drawingml_els:
|
||||
if self.docx_to_pdf_converter is None:
|
||||
_log.warning(
|
||||
"Found DrawingML elements in document, but no DOCX to PDF converters. "
|
||||
"If you want these exported, make sure you have "
|
||||
"LibreOffice binary in PATH. "
|
||||
)
|
||||
else:
|
||||
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
|
||||
# Check for the sdt containers, like table of contents
|
||||
elif tag_name in ["sdt"]:
|
||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||
@@ -1381,3 +1408,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
elem_ref.append(p3.get_ref())
|
||||
return elem_ref
|
||||
|
||||
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
    """Render DrawingML elements to an image and add it to the document.

    A fresh copy of the source document is emptied and filled with copies of
    the given DrawingML elements, then exported DOCX -> PDF -> image. If the
    export or image handling fails, a picture item without image data is
    added instead, so the figure still appears in the output.

    Args:
        doc: The DoclingDocument receiving the picture item.
        drawingml_els: Iterable of DrawingML XML elements to render.
    """
    # Local import: only needed to recognise converter failures below.
    import subprocess

    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    body = dml_doc._element.body
    for child in list(body):
        body.remove(child)

    # 2) Add DrawingML to empty document
    new_para = dml_doc.add_paragraph()
    new_r = new_para.add_run()
    for dml in drawingml_els:
        # Deep-copy so the elements stay attached to the original document.
        new_r._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    parent = self.parents[level - 1]
    try:
        pil_image = get_pil_from_dml_docx(
            dml_doc, converter=self.docx_to_pdf_converter
        )
        if pil_image is None:
            raise UnidentifiedImageError

        doc.add_picture(
            parent=parent,
            image=ImageRef.from_pil(image=pil_image, dpi=72),
            caption=None,
        )
    except subprocess.CalledProcessError as exc:
        # The converter runs with check=True; a non-zero exit status is not an
        # OSError and would otherwise abort the whole document conversion.
        _log.warning("DOCX to PDF conversion of DrawingML element failed: %s", exc)
        doc.add_picture(
            parent=parent,
            caption=None,
        )
    except (UnidentifiedImageError, OSError):
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
        doc.add_picture(
            parent=parent,
            caption=None,
        )

    return
|
||||
|
||||
BIN
tests/data/docx/drawingml.docx
vendored
Normal file
BIN
tests/data/docx/drawingml.docx
vendored
Normal file
Binary file not shown.
13
tests/data/groundtruth/docling_v2/drawingml.docx.itxt
vendored
Normal file
13
tests/data/groundtruth/docling_v2/drawingml.docx.itxt
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group textbox
|
||||
item-2 at level 2: text: Text 2
|
||||
item-3 at level 2: text: Text 1
|
||||
item-4 at level 1: picture
|
||||
item-5 at level 1: text:
|
||||
item-6 at level 1: text:
|
||||
item-7 at level 1: text:
|
||||
item-8 at level 1: text:
|
||||
item-9 at level 1: text:
|
||||
item-10 at level 1: text:
|
||||
item-11 at level 1: text:
|
||||
item-12 at level 1: picture
|
||||
250
tests/data/groundtruth/docling_v2/drawingml.docx.json
vendored
Normal file
250
tests/data/groundtruth/docling_v2/drawingml.docx.json
vendored
Normal file
File diff suppressed because one or more lines are too long
7
tests/data/groundtruth/docling_v2/drawingml.docx.md
vendored
Normal file
7
tests/data/groundtruth/docling_v2/drawingml.docx.md
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
Text 2
|
||||
|
||||
Text 1
|
||||
|
||||
<!-- image -->
|
||||
|
||||
<!-- image -->
|
||||
Reference in New Issue
Block a user