fix: enrichment of documents without pages metadata (pptx and xlsx) (#2401)

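Fix the enrichment logic for pptx and xlsx documents: try the embedded image for every PictureItem, and skip the item only when it has neither an embedded image nor any provenance entries.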

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-10-07 18:28:51 +02:00
committed by GitHub
parent 9705f4020c
commit 0610d01afa

View File

@@ -173,11 +173,11 @@ class BaseItemAndImageEnrichmentModel(
assert isinstance(element, DocItem) assert isinstance(element, DocItem)
# Allow the case of documents without page images but embedded images (e.g. Word and HTML docs) # Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
if len(element.prov) == 0 and isinstance(element, PictureItem): if isinstance(element, PictureItem):
embedded_im = element.get_image(conv_res.document) embedded_im = element.get_image(conv_res.document)
if embedded_im is not None: if embedded_im is not None:
return ItemAndImageEnrichmentElement(item=element, image=embedded_im) return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
else: elif len(element.prov) == 0:
return None return None
# Crop the image from the page # Crop the image from the page