mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat(docx): Process drawingml objects in docx (#2453)
* Export of DrawingML figures into docling document
* Adding libreoffice env var and libreoffice to checks image
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* DCO Remediation Commit for Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
I, Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>, hereby add my Signed-off-by to this commit: 9518fffcad
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* Enforcing apt get update
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* Only display drawingml warning once per document
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
* add util to test libreoffice and exclude files from test when not found
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* check libreoffice only once
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* Only initialise converter if needed
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
---------
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
3e6da2c62d
commit
16829939cf
18
.github/workflows/checks.yml
vendored
18
.github/workflows/checks.yml
vendored
@@ -80,10 +80,8 @@ jobs:
|
|||||||
|
|
||||||
- name: Install System Dependencies
|
- name: Install System Dependencies
|
||||||
run: |
|
run: |
|
||||||
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
|
sudo apt-get -qq update
|
||||||
sudo apt-get -qq update
|
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
|
||||||
fi
|
|
||||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
|
|
||||||
|
|
||||||
- name: Set TESSDATA_PREFIX
|
- name: Set TESSDATA_PREFIX
|
||||||
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
||||||
@@ -149,10 +147,8 @@ jobs:
|
|||||||
|
|
||||||
- name: Install System Dependencies
|
- name: Install System Dependencies
|
||||||
run: |
|
run: |
|
||||||
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
|
sudo apt-get -qq update
|
||||||
sudo apt-get -qq update
|
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
|
||||||
fi
|
|
||||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
|
|
||||||
|
|
||||||
- name: Set TESSDATA_PREFIX
|
- name: Set TESSDATA_PREFIX
|
||||||
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
||||||
@@ -223,10 +219,8 @@ jobs:
|
|||||||
|
|
||||||
- name: Install System Dependencies
|
- name: Install System Dependencies
|
||||||
run: |
|
run: |
|
||||||
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
|
sudo apt-get -qq update
|
||||||
sudo apt-get -qq update
|
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
|
||||||
fi
|
|
||||||
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
|
|
||||||
|
|
||||||
- name: Set TESSDATA_PREFIX
|
- name: Set TESSDATA_PREFIX
|
||||||
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
||||||
|
|||||||
131
docling/backend/docx/drawingml/utils.py
Normal file
131
docling/backend/docx/drawingml/utils.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from tempfile import mkdtemp
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
|
import pypdfium2
|
||||||
|
from docx.document import Document
|
||||||
|
from PIL import Image, ImageChops
|
||||||
|
|
||||||
|
|
||||||
|
def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Return the LibreOffice command and optionally test it.

    The executable is looked up in this order; the first hit wins:

    1. the ``DOCLING_LIBREOFFICE_CMD`` environment variable (a command name
       or a path), which is what the DrawingML warning emitted by the DOCX
       backend tells users to set;
    2. ``libreoffice`` on ``PATH``;
    3. ``soffice`` on ``PATH``;
    4. the default macOS application bundle location.

    Args:
        raise_if_unavailable: When true, raise instead of returning ``None``
            if nothing is found, and additionally run ``<cmd> -h`` to verify
            that the executable can actually be started.

    Returns:
        The command/path of the LibreOffice executable, or ``None`` when it
        cannot be located and ``raise_if_unavailable`` is false.

    Raises:
        RuntimeError: No executable was found and ``raise_if_unavailable``
            is true.
        subprocess.CalledProcessError: The ``-h`` probe exited with a
            non-zero status (only when ``raise_if_unavailable`` is true).
        OSError: The probe could not be launched at all (only when
            ``raise_if_unavailable`` is true).
    """
    macos_bundle_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    # An explicit user override takes precedence over auto-discovery.
    # shutil.which() accepts both bare command names and full paths and
    # returns None if the target is not an executable file, in which case
    # we silently fall through to the regular lookup.
    configured_cmd = os.environ.get("DOCLING_LIBREOFFICE_CMD")

    libreoffice_cmd = (
        (shutil.which(configured_cmd) if configured_cmd else None)
        or shutil.which("libreoffice")
        or shutil.which("soffice")
        or (macos_bundle_cmd if os.path.isfile(macos_bundle_cmd) else None)
    )

    if raise_if_unavailable:
        if libreoffice_cmd is None:
            raise RuntimeError("Libreoffice not found")

        # The following test will raise if the libreoffice_cmd cannot be used
        subprocess.run(
            [libreoffice_cmd, "-h"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    return libreoffice_cmd
|
||||||
|
|
||||||
|
|
||||||
|
def get_docx_to_pdf_converter() -> Optional[Callable]:
    """
    Detects the best available DOCX to PDF tool and returns a conversion function.

    The returned function accepts (input_path, output_path); both may be
    ``str`` or ``os.PathLike`` objects. It raises
    ``subprocess.CalledProcessError`` if the external tool exits with a
    non-zero status.

    Returns None if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            # Normalise to plain strings up front: callers pass pathlib.Path
            # objects, and a str never compares equal to a Path, which would
            # make the "do we need to rename?" check below always true.
            src = os.fspath(input_path)
            dst = os.fspath(output_path)
            # An output path without a directory component means "current
            # directory"; never hand an empty string to --outdir.
            out_dir = os.path.dirname(dst) or os.curdir

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    out_dir,
                    src,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # LibreOffice only lets us choose the output directory; the file
            # is expected to be named after the input file's stem.
            produced = os.path.join(
                out_dir,
                os.path.splitext(os.path.basename(src))[0] + ".pdf",
            )
            if os.path.abspath(produced) != os.path.abspath(dst):
                # os.replace overwrites an existing destination on every
                # platform, whereas os.rename fails on Windows in that case.
                os.replace(produced, dst)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop away the uniform background surrounding the content of ``image``.

    Args:
        image: The picture to trim.
        bg_color: Colour treated as background. When ``None``, the colour of
            the top-left pixel ``(0, 0)`` is used.
        padding: Number of pixels of background to keep around the detected
            content, clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when the bounding-box lookup
        on the difference image yields nothing (i.e. no pixel differs from
        the background colour).
    """
    reference_color = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Pixels equal to the background become zero in the difference image, so
    # its bounding box encloses everything that is *not* background.
    background = Image.new(image.mode, image.size, reference_color)
    content_box = ImageChops.difference(image, background).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    padded_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(padded_box)
|
||||||
|
|
||||||
|
|
||||||
|
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render the first page of ``docx`` to a whitespace-cropped PIL image.

    The document is saved into a fresh temporary directory, converted to PDF
    with ``converter`` and the first PDF page is rasterised at ``scale=2``.
    The temporary directory and the pdfium handles are released even when
    one of the steps raises.

    Args:
        docx: Document object to render; only its ``save(path)`` method is
            used here.
        converter: Callable taking ``(input_path, output_path)`` that writes
            a PDF for the given DOCX. When ``None``, nothing is rendered.

    Returns:
        The cropped image of the first page, or ``None`` when ``converter``
        is ``None``.

    Raises:
        Whatever ``docx.save``, ``converter`` or the PDF loading/rendering
        raise; these exceptions are propagated unchanged after cleanup.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                # The image is fully computed before the finally-blocks close
                # the page and the document, same order as a manual close.
                return crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Always remove the scratch directory, including on failure, so that
        # failed conversions do not accumulate files in the temp location.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
@@ -1,8 +1,9 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from copy import deepcopy
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, List, Optional, Union
|
from typing import Any, Callable, List, Optional, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
|
|||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
|
from docling.backend.docx.drawingml.utils import (
|
||||||
|
get_docx_to_pdf_converter,
|
||||||
|
get_libreoffice_cmd,
|
||||||
|
get_pil_from_dml_docx,
|
||||||
|
)
|
||||||
from docling.backend.docx.latex.omml import oMath2Latex
|
from docling.backend.docx.latex.omml import oMath2Latex
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
@@ -64,6 +70,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||||
# Track processed textbox elements to avoid duplication
|
# Track processed textbox elements to avoid duplication
|
||||||
self.processed_textbox_elements: List[int] = []
|
self.processed_textbox_elements: List[int] = []
|
||||||
|
self.docx_to_pdf_converter: Optional[Callable] = None
|
||||||
|
self.docx_to_pdf_converter_init = False
|
||||||
|
self.display_drawingml_warning = True
|
||||||
|
|
||||||
for i in range(-1, self.max_levels):
|
for i in range(-1, self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
"indents": [None],
|
"indents": [None],
|
||||||
}
|
}
|
||||||
|
|
||||||
self.docx_obj = None
|
self.docx_obj = self.load_msword_file(
|
||||||
try:
|
path_or_stream=self.path_or_stream, document_hash=self.document_hash
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
)
|
||||||
self.docx_obj = Document(self.path_or_stream)
|
if self.docx_obj:
|
||||||
elif isinstance(self.path_or_stream, Path):
|
|
||||||
self.docx_obj = Document(str(self.path_or_stream))
|
|
||||||
|
|
||||||
self.valid = True
|
self.valid = True
|
||||||
except Exception as e:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
|
||||||
) from e
|
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> Optional[DocxDocument]:
    """Open a Word document from a path or an in-memory stream.

    Args:
        path_or_stream: Either a ``BytesIO`` holding the DOCX bytes or a
            ``Path`` pointing to the file.
        document_hash: Identifier of the document; only used in the error
            message.

    Returns:
        The loaded document, or ``None`` when ``path_or_stream`` is neither
        a ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: The document could not be opened; the original
            exception is chained as the cause.
    """
    try:
        if isinstance(path_or_stream, BytesIO):
            return Document(path_or_stream)
        if isinstance(path_or_stream, Path):
            return Document(str(path_or_stream))
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from e

    # Unsupported input type: callers treat a missing document as "invalid".
    return None
|
||||||
|
|
||||||
def _update_history(
|
def _update_history(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
}
|
}
|
||||||
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
||||||
drawing_blip = xpath_expr(element)
|
drawing_blip = xpath_expr(element)
|
||||||
|
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
|
||||||
|
|
||||||
# Check for textbox content - check multiple textbox formats
|
# Check for textbox content - check multiple textbox formats
|
||||||
# Only process if the element hasn't been processed before
|
# Only process if the element hasn't been processed before
|
||||||
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
):
|
):
|
||||||
te1 = self._handle_text_elements(element, docx_obj, doc)
|
te1 = self._handle_text_elements(element, docx_obj, doc)
|
||||||
added_elements.extend(te1)
|
added_elements.extend(te1)
|
||||||
|
# Check for DrawingML elements
|
||||||
|
elif drawingml_els:
|
||||||
|
if (
|
||||||
|
self.docx_to_pdf_converter is None
|
||||||
|
and self.docx_to_pdf_converter_init is False
|
||||||
|
):
|
||||||
|
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
|
||||||
|
self.docx_to_pdf_converter_init = True
|
||||||
|
|
||||||
|
if self.docx_to_pdf_converter is None:
|
||||||
|
if self.display_drawingml_warning:
|
||||||
|
if self.docx_to_pdf_converter is None:
|
||||||
|
_log.warning(
|
||||||
|
"Found DrawingML elements in document, but no DOCX to PDF converters. "
|
||||||
|
"If you want these exported, make sure you have "
|
||||||
|
"LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
|
||||||
|
)
|
||||||
|
self.display_drawingml_warning = False
|
||||||
|
else:
|
||||||
|
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
|
||||||
# Check for the sdt containers, like table of contents
|
# Check for the sdt containers, like table of contents
|
||||||
elif tag_name in ["sdt"]:
|
elif tag_name in ["sdt"]:
|
||||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||||
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
elem_ref.append(p3.get_ref())
|
elem_ref.append(p3.get_ref())
|
||||||
return elem_ref
|
return elem_ref
|
||||||
|
|
||||||
|
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
    """Render DrawingML elements to an image and add it to ``doc`` as a picture.

    A second copy of the source document is loaded, its body is emptied and
    the given drawing elements are inserted into a single run. That
    drawing-only document is rendered via ``get_pil_from_dml_docx`` using
    ``self.docx_to_pdf_converter``. If rendering fails, a picture item
    without image data is added instead so the document structure still
    records the figure.

    Args:
        doc: Target document that receives the picture item.
        drawingml_els: Iterable of ``w:drawing`` XML elements to render.
    """
    # Local import: failures of the external converter surface as
    # subprocess errors, which must be handled like any other render failure.
    import subprocess

    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    body = dml_doc._element.body
    for child in list(body):
        body.remove(child)

    # 2) Add DrawingML to empty document
    new_para = dml_doc.add_paragraph()
    new_r = new_para.add_run()
    for dml in drawingml_els:
        # Deep-copy so the elements stay attached to the original tree too.
        new_r._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    try:
        pil_image = get_pil_from_dml_docx(
            dml_doc, converter=self.docx_to_pdf_converter
        )
        if pil_image is None:
            raise UnidentifiedImageError

        # NOTE(review): the page is rendered with scale=2 but recorded with
        # dpi=72 here - confirm that this is the intended resolution.
        doc.add_picture(
            parent=self.parents[level - 1],
            image=ImageRef.from_pil(image=pil_image, dpi=72),
            caption=None,
        )
    except (UnidentifiedImageError, OSError, subprocess.SubprocessError):
        # subprocess.CalledProcessError (converter exited non-zero) is not an
        # OSError; without catching SubprocessError a single failed drawing
        # would abort the conversion of the whole document.
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
        doc.add_picture(
            parent=self.parents[level - 1],
            caption=None,
        )

    return
|
||||||
|
|||||||
BIN
tests/data/docx/drawingml.docx
vendored
Normal file
BIN
tests/data/docx/drawingml.docx
vendored
Normal file
Binary file not shown.
13
tests/data/groundtruth/docling_v2/drawingml.docx.itxt
vendored
Normal file
13
tests/data/groundtruth/docling_v2/drawingml.docx.itxt
vendored
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: section: group textbox
|
||||||
|
item-2 at level 2: text: Text 2
|
||||||
|
item-3 at level 2: text: Text 1
|
||||||
|
item-4 at level 1: picture
|
||||||
|
item-5 at level 1: text:
|
||||||
|
item-6 at level 1: text:
|
||||||
|
item-7 at level 1: text:
|
||||||
|
item-8 at level 1: text:
|
||||||
|
item-9 at level 1: text:
|
||||||
|
item-10 at level 1: text:
|
||||||
|
item-11 at level 1: text:
|
||||||
|
item-12 at level 1: picture
|
||||||
250
tests/data/groundtruth/docling_v2/drawingml.docx.json
vendored
Normal file
250
tests/data/groundtruth/docling_v2/drawingml.docx.json
vendored
Normal file
File diff suppressed because one or more lines are too long
7
tests/data/groundtruth/docling_v2/drawingml.docx.md
vendored
Normal file
7
tests/data/groundtruth/docling_v2/drawingml.docx.md
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
Text 2
|
||||||
|
|
||||||
|
Text 1
|
||||||
|
|
||||||
|
<!-- image -->
|
||||||
|
|
||||||
|
<!-- image -->
|
||||||
@@ -1,7 +1,9 @@
|
|||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
@@ -17,6 +19,7 @@ from .test_data_gen_flag import GEN_TEST_DATA
|
|||||||
from .verify_utils import verify_document, verify_export
|
from .verify_utils import verify_document, verify_export
|
||||||
|
|
||||||
GENERATE = GEN_TEST_DATA
|
GENERATE = GEN_TEST_DATA
|
||||||
|
IS_CI = bool(os.getenv("CI"))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(strict=False)
|
@pytest.mark.xfail(strict=False)
|
||||||
@@ -84,8 +87,22 @@ def get_converter():
|
|||||||
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||||
converter = get_converter()
|
converter = get_converter()
|
||||||
|
|
||||||
|
has_libreoffice = False
|
||||||
|
try:
|
||||||
|
cmd = get_libreoffice_cmd(raise_if_unavailable=True)
|
||||||
|
if cmd is not None:
|
||||||
|
has_libreoffice = True
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
for docx_path in docx_paths:
|
for docx_path in docx_paths:
|
||||||
# print(f"converting {docx_path}")
|
if (
|
||||||
|
not IS_CI
|
||||||
|
and not has_libreoffice
|
||||||
|
and str(docx_path) in ("tests/data/docx/drawingml.docx",)
|
||||||
|
):
|
||||||
|
print(f"Skipping {docx_path} because no Libreoffice is installed.")
|
||||||
|
continue
|
||||||
|
|
||||||
gt_path = (
|
gt_path = (
|
||||||
docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
|
docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
|
||||||
|
|||||||
Reference in New Issue
Block a user