mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
Adding feature to import drawingml objects in doclingdocument
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
133
docling/backend/docx/drawingml/utils.py
Normal file
133
docling/backend/docx/drawingml/utils.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from tempfile import mkdtemp
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pypdfium2
|
||||
from docx.document import Document
|
||||
from PIL import Image, ImageChops
|
||||
|
||||
|
||||
def _move_converted_pdf(input_path, output_path) -> None:
    """Move the PDF a converter wrote into the output directory to ``output_path``.

    LibreOffice and docx2pdf are handed an output *directory*; the resulting
    file is expected to be named after the input file's stem with a ``.pdf``
    suffix. If that differs from the requested ``output_path``, the file is
    moved there.

    Args:
        input_path: Path (``str`` or ``os.PathLike``) of the converted DOCX.
        output_path: Path (``str`` or ``os.PathLike``) the PDF should end up at.
    """
    # Normalise to ``str`` so that a ``Path`` argument is not compared against
    # a ``str`` (which would always be unequal and trigger a pointless move).
    output_path = os.fspath(output_path)
    stem = os.path.splitext(os.path.basename(os.fspath(input_path)))[0]
    produced = os.path.join(os.path.dirname(output_path), stem + ".pdf")
    if produced != output_path:
        # ``os.replace`` overwrites an existing destination on every platform,
        # whereas ``os.rename`` fails on Windows if the destination exists.
        os.replace(produced, output_path)


def get_docx_to_pdf_converter() -> Optional[Callable]:
    """
    Detects the best available DOCX to PDF tool and returns a conversion function.

    Tools are probed in this order: LibreOffice (``libreoffice`` or ``soffice``
    on PATH), the ``docx2pdf`` package (MS Word required), and ``pypandoc``
    together with a ``pandoc`` binary on PATH.

    The returned function accepts (input_path, output_path); both may be
    ``str`` or ``os.PathLike``.

    Returns None if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = shutil.which("libreoffice") or shutil.which("soffice")
    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    # Fall back to the current directory when output_path
                    # has no directory component.
                    os.path.dirname(os.fspath(output_path)) or ".",
                    os.fspath(input_path),
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,  # raise CalledProcessError on a non-zero exit
            )
            _move_converted_pdf(input_path, output_path)

        return convert_with_libreoffice

    # Try docx2pdf (MS Word required)
    try:
        import docx2pdf  # type: ignore  # noqa: F401  (availability probe)
    except ImportError:
        pass
    else:

        def convert_with_docx2pdf(input_path, output_path):
            from docx2pdf import convert  # type: ignore

            convert(
                os.fspath(input_path),
                os.path.dirname(os.fspath(output_path)) or ".",
            )
            # Move result if necessary
            _move_converted_pdf(input_path, output_path)

        return convert_with_docx2pdf

    # Try Pandoc (needs both the Python wrapper and the pandoc binary)
    try:
        import pypandoc  # type: ignore  # noqa: F401  (availability probe)
    except ImportError:
        pass
    else:
        if shutil.which("pandoc"):

            def convert_with_pandoc(input_path, output_path):
                import pypandoc  # type: ignore

                pypandoc.convert_file(
                    os.fspath(input_path),
                    "pdf",
                    outputfile=os.fspath(output_path),
                )

            return convert_with_pandoc

    # No tools found
    return None
|
||||
|
||||
|
||||
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop the uniform background border away from ``image``.

    Args:
        image: The image to trim.
        bg_color: Colour treated as background. When ``None``, the colour of
            the top-left pixel is used.
        padding: Number of pixels of background to keep around the content,
            clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no pixel differs from the
        background colour.
    """
    fill = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Pixels equal to the background become zero in the difference image, so
    # its bounding box encloses exactly the non-background content.
    background = Image.new(image.mode, image.size, fill)
    content_box = ImageChops.difference(image, background).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    crop_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(crop_box)
|
||||
|
||||
|
||||
def get_pil_from_dml_docx(
    docx: "Document", converter: Optional[Callable]
) -> "Optional[Image.Image]":
    """Render the first page of ``docx`` to a whitespace-cropped PIL image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter``, and the first PDF page is rendered at scale 2 and cropped
    with ``crop_whitespace``.

    Args:
        docx: Document object exposing ``save(path)``.
        converter: Callable taking ``(input_path, output_path)`` that writes a
            PDF, or ``None`` when no conversion tool is available.

    Returns:
        The rendered image, or ``None`` if ``converter`` is ``None``.

    Raises:
        Any exception raised while saving, converting or rendering is
        propagated; temporary files are removed in every case.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Render the first PDF page to a PIL image
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                image = crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            # Close the PDF before the directory is removed so the file is
            # no longer held open.
            pdf.close()

        return image
    finally:
        # Always clean up, including when conversion or rendering failed.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
@@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Union
|
||||
@@ -30,6 +31,10 @@ from pydantic import AnyUrl
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.docx.drawingml.utils import (
|
||||
get_docx_to_pdf_converter,
|
||||
get_pil_from_dml_docx,
|
||||
)
|
||||
from docling.backend.docx.latex.omml import oMath2Latex
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
@@ -61,6 +66,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||
# Track processed textbox elements to avoid duplication
|
||||
self.processed_textbox_elements: List[int] = []
|
||||
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
|
||||
|
||||
for i in range(-1, self.max_levels):
|
||||
self.parents[i] = None
|
||||
@@ -75,18 +81,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
"indents": [None],
|
||||
}
|
||||
|
||||
self.docx_obj = None
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.docx_obj = Document(self.path_or_stream)
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
self.docx_obj = Document(str(self.path_or_stream))
|
||||
|
||||
self.docx_obj = self.load_msword_file(
|
||||
path_or_stream=self.path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
if self.docx_obj:
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
@override
|
||||
def is_valid(self) -> bool:
|
||||
@@ -133,6 +132,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||
)
|
||||
|
||||
@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> DocxDocument:
    """Open a Word document from an in-memory stream or a filesystem path.

    Args:
        path_or_stream: A ``BytesIO`` holding the file contents, or a ``Path``
            to the file.
        document_hash: Hash of the document, used only in the error message.

    Returns:
        The loaded document. ``None`` is returned when ``path_or_stream`` is
        neither a ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: If loading the document fails for any reason; the
            original exception is chained as the cause.
    """
    try:
        if isinstance(path_or_stream, BytesIO):
            source = path_or_stream
        elif isinstance(path_or_stream, Path):
            # Paths are handed over in their string form.
            source = str(path_or_stream)
        else:
            return None
        return Document(source)
    except Exception as exc:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from exc
|
||||
|
||||
def _update_history(
|
||||
self,
|
||||
name: str,
|
||||
@@ -187,6 +202,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
}
|
||||
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
||||
drawing_blip = xpath_expr(element)
|
||||
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
|
||||
|
||||
# Check for textbox content - check multiple textbox formats
|
||||
# Only process if the element hasn't been processed before
|
||||
@@ -261,6 +277,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
and element.find(".//w:t", namespaces=namespaces) is not None
|
||||
):
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
# Check for DrawingML elements
|
||||
elif drawingml_els:
|
||||
if self.docx_to_pdf_converter is None:
|
||||
_log.warning(
|
||||
"Found DrawingML elements in document, but no DOCX to PDF converters. "
|
||||
"If you want these exported, make sure you have "
|
||||
"LibreOffice (make sure its binary is in PATH) [Preferred], "
|
||||
"Word+docx2pdf, "
|
||||
"or pypandoc installed."
|
||||
)
|
||||
else:
|
||||
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
|
||||
# Check for the sdt containers, like table of contents
|
||||
elif tag_name in ["sdt"]:
|
||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||
@@ -1170,3 +1198,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
caption=None,
|
||||
)
|
||||
return
|
||||
|
||||
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
    """Render DrawingML elements to an image and add it to ``doc`` as a picture.

    A fresh copy of the source document is loaded and emptied, the given
    DrawingML elements are copied into a single run, and that document is
    exported DOCX->PDF->image. If no image can be produced, a picture item
    without image data is added instead so the element is still represented.

    Args:
        doc: The document the picture item is added to.
        drawingml_els: Iterable of DrawingML XML elements to render.
    """
    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    body = dml_doc._element.body
    # Iterate over a snapshot: children are removed while looping.
    for child in list(body):
        body.remove(child)

    # 2) Add DrawingML to empty document
    new_para = dml_doc.add_paragraph()
    new_r = new_para.add_run()
    for dml in drawingml_els:
        # Deep-copy so the element stays attached to the original tree too.
        new_r._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    image_ref = None
    try:
        pil_image = get_pil_from_dml_docx(
            dml_doc, converter=self.docx_to_pdf_converter
        )
        if pil_image is None:
            raise UnidentifiedImageError
        image_ref = ImageRef.from_pil(image=pil_image, dpi=72)
    except Exception:
        # The export shells out to external converters and a PDF renderer,
        # which can fail with errors other than OSError (for example
        # subprocess.CalledProcessError). A failed rendering of one drawing
        # must not abort the conversion of the whole document.
        _log.warning(
            "Warning: DrawingML image cannot be loaded by Pillow", exc_info=True
        )

    # Add the picture outside the try block so that an error raised by
    # add_picture itself can never result in a second picture being added.
    if image_ref is not None:
        doc.add_picture(
            parent=self.parents[level - 1],
            image=image_ref,
            caption=None,
        )
    else:
        doc.add_picture(
            parent=self.parents[level - 1],
            caption=None,
        )

    return
|
||||
|
||||
Reference in New Issue
Block a user