Added captions for images in the SmolDocling assembly code; improved the provenance definition for all elements

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-12-09 05:08:14 +00:00 · 2025-02-11 16:42:23 +01:00
parent d7abe1b1cd
commit b1df461ca8
1 changed files with 28 additions and 3 deletions
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -399,6 +399,10 @@ class VlmPipeline(PaginatedPipeline):
                    doc.add_table(data=table_data)

                elif tag_name == "picture":
+                    text_caption_content = extract_inner_text(full_chunk)
+                    print("----------- TEXT CONTENT OF A PICTURE TAG -------------")
+                    print(text_caption_content)
+                    print("-------------------------------------------------------")
                    if image:
                        if bbox:
                            width, height = image.size
@@ -409,7 +413,7 @@ class VlmPipeline(PaginatedPipeline):
                                int(bbox.b * height),
                            )
                            cropped_image = image.crop(crop_box)
-                            doc.add_picture(
+                            pic = doc.add_picture(
                                parent=None,
                                image=ImageRef.from_pil(image=cropped_image, dpi=72),
                                prov=(
@@ -418,18 +422,35 @@ class VlmPipeline(PaginatedPipeline):
                                    )
                                ),
                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = doc.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
                    else:
                        if bbox:
+                            # In case we don't have access to the binary data of an image
                            doc.add_picture(
                                parent=None,
                                prov=ProvenanceItem(
                                    bbox=bbox, charspan=(0, 0), page_no=page_no
                                ),
                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = doc.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
                else:
                    # For everything else, treat as text
                    if self.force_backend_text:
-                        content = extract_text_from_backend(page, bbox)
+                        text_content = extract_text_from_backend(page, bbox)
                    else:
                        text_content = extract_inner_text(full_chunk)
                    # If it's code, wrap it with <pre><code> tags
@@ -439,7 +460,11 @@ class VlmPipeline(PaginatedPipeline):
                        label=doc_label,
                        text=text_content,
                        prov=(
-                            ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=page_no)
+                            ProvenanceItem(
+                                bbox=bbox,
+                                charspan=(0, len(text_content)),
+                                page_no=page_no,
+                            )
                            if bbox
                            else None
                        ),