From b1df461ca88768e611a34bc8317f158138dece93 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 11 Feb 2025 16:42:23 +0100 Subject: [PATCH] Added captions for the images for SmolDocling assembly code, improved provenance definition for all elements Signed-off-by: Maksym Lysak --- docling/pipeline/vlm_pipeline.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 26b8d2d8..60f0525e 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -399,6 +399,10 @@ class VlmPipeline(PaginatedPipeline): doc.add_table(data=table_data) elif tag_name == "picture": + text_caption_content = extract_inner_text(full_chunk) + print("----------- TEXT CONTENT OF A PICTURE TAG -------------") + print(text_caption_content) + print("-------------------------------------------------------") if image: if bbox: width, height = image.size @@ -409,7 +413,7 @@ class VlmPipeline(PaginatedPipeline): int(bbox.b * height), ) cropped_image = image.crop(crop_box) - doc.add_picture( + pic = doc.add_picture( parent=None, image=ImageRef.from_pil(image=cropped_image, dpi=72), prov=( @@ -418,18 +422,35 @@ class VlmPipeline(PaginatedPipeline): ) ), ) + # If there is a caption to an image, add it as well + if len(text_caption_content) > 0: + caption_item = doc.add_text( + label=DocItemLabel.CAPTION, + text=text_caption_content, + parent=None, + ) + pic.captions.append(caption_item.get_ref()) else: if bbox: + # In case we don't have access to an binary of an image doc.add_picture( parent=None, prov=ProvenanceItem( bbox=bbox, charspan=(0, 0), page_no=page_no ), ) + # If there is a caption to an image, add it as well + if len(text_caption_content) > 0: + caption_item = doc.add_text( + label=DocItemLabel.CAPTION, + text=text_caption_content, + parent=None, + ) + pic.captions.append(caption_item.get_ref()) else: # For everything else, treat as text if self.force_backend_text: - content = extract_text_from_backend(page, bbox) + text_content = extract_text_from_backend(page, bbox) else: text_content = extract_inner_text(full_chunk) # If it's code, wrap it with
 tags
@@ -439,7 +460,11 @@ class VlmPipeline(PaginatedPipeline):
                         label=doc_label,
                         text=text_content,
                         prov=(
-                            ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=page_no)
+                            ProvenanceItem(
+                                bbox=bbox,
+                                charspan=(0, len(text_content)),
+                                page_no=page_no,
+                            )
                             if bbox
                             else None
                         ),