From 94d1fd41cd0b1e29bf725022d7efb8682f4e847e Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 25 Nov 2024 20:20:45 +0100 Subject: [PATCH] Added safety try-except when trying to load pillow image from a docx blob. Added explicit dependency on lxml. Signed-off-by: Maksym Lysak --- docling/backend/msword_backend.py | 20 +++++++++++++------- poetry.lock | 7 ++++++- pyproject.toml | 1 + 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index a8b7e9d6..cb04672b 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -15,7 +15,7 @@ from docling_core.types.doc import ( ) from lxml import etree from lxml.etree import XPath -from PIL import Image +from PIL import Image, UnidentifiedImageError from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -509,10 +509,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): image_data = get_docx_image(element, drawing_blip) image_bytes = BytesIO(image_data) # Open the BytesIO object with PIL to create an Image - pil_image = Image.open(image_bytes) - doc.add_picture( - parent=self.parents[self.level], - image=ImageRef.from_pil(image=pil_image, dpi=72), - caption=None, - ) + try: + pil_image = Image.open(image_bytes) + doc.add_picture( + parent=self.parents[self.level], + image=ImageRef.from_pil(image=pil_image, dpi=72), + caption=None, + ) + except (UnidentifiedImageError, OSError) as e: + doc.add_picture( + parent=self.parents[self.level], + caption=None, + ) return diff --git a/poetry.lock b/poetry.lock index c117db24..67249e7f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5761,6 +5761,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -7353,4 +7358,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "6772098f9951e636c03d958b81f7df3ae0e8746be558fd031aeaa67f9bb45a79" +content-hash = "125c3fbca0f738cdb816af389a19ff9d93d4a05a2cd31902605e91c3f17caaae" diff --git a/pyproject.toml b/pyproject.toml index c0907b22..275383a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3" pandas = "^2.1.4" marko = "^2.1.2" openpyxl = "^3.1.5" +lxml = "^5.3.0" ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true } [tool.poetry.group.dev.dependencies]