From b1df461ca88768e611a34bc8317f158138dece93 Mon Sep 17 00:00:00 2001
From: Maksym Lysak <mly@zurich.ibm.com>
Date: Tue, 11 Feb 2025 16:42:23 +0100
Subject: [PATCH] Added captions for the images for SmolDocling assembly code,
 improved provenance definition for all elements

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
---
 docling/pipeline/vlm_pipeline.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

Review notes (this free-form area is ignored when the patch is applied):
Removed the three debug print() calls from the picture branch.
The image-less picture branch now binds `pic`, which its caption code needs.
Caption checks use truthiness, so a None caption no longer raises.
The post-image blob id on the `index` line predates these edits.
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 26b8d2d8..60f0525e 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -399,6 +399,8 @@ class VlmPipeline(PaginatedPipeline):
                     doc.add_table(data=table_data)
 
                 elif tag_name == "picture":
+                    # Inner text of the picture tag, used below as its caption
+                    text_caption_content = extract_inner_text(full_chunk)
                     if image:
                         if bbox:
                             width, height = image.size
@@ -409,7 +411,7 @@ class VlmPipeline(PaginatedPipeline):
                                 int(bbox.b * height),
                             )
                             cropped_image = image.crop(crop_box)
-                            doc.add_picture(
+                            pic = doc.add_picture(
                                 parent=None,
                                 image=ImageRef.from_pil(image=cropped_image, dpi=72),
                                 prov=(
@@ -418,18 +420,35 @@ class VlmPipeline(PaginatedPipeline):
                                     )
                                 ),
                             )
+                            # If there is a caption to an image, add it as well
+                            if text_caption_content:
+                                caption_item = doc.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
                     else:
                         if bbox:
+                            # No image binary available: keep only the provenance
-                            doc.add_picture(
+                            pic = doc.add_picture(
                                 parent=None,
                                 prov=ProvenanceItem(
                                     bbox=bbox, charspan=(0, 0), page_no=page_no
                                 ),
                             )
+                            # If there is a caption to an image, add it as well
+                            if text_caption_content:
+                                caption_item = doc.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
                 else:
                     # For everything else, treat as text
                     if self.force_backend_text:
-                        content = extract_text_from_backend(page, bbox)
+                        text_content = extract_text_from_backend(page, bbox)
                     else:
                         text_content = extract_inner_text(full_chunk)
                     # If it's code, wrap it with <pre><code> tags
@@ -439,7 +458,11 @@ class VlmPipeline(PaginatedPipeline):
                         label=doc_label,
                         text=text_content,
                         prov=(
-                            ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=page_no)
+                            ProvenanceItem(
+                                bbox=bbox,
+                                charspan=(0, len(text_content)),
+                                page_no=page_no,
+                            )
                             if bbox
                             else None
                         ),