Properly propagate per-page image data, together with the predicted tags, in the VLM pipeline. This enables correct figure extraction and correct page numbers in provenances.

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2025-01-13 15:21:19 +01:00
parent 01c46e24b1
commit 61bb9dbba2
2 changed files with 229 additions and 218 deletions

View File

@ -73,7 +73,7 @@ class VlmPipeline(PaginatedPipeline):
disable_progress_bars() disable_progress_bars()
# TODO download the correct model (private repo) # TODO: download the correct model (private repo)
download_path = snapshot_download( download_path = snapshot_download(
repo_id="ds4sd/xxx", repo_id="ds4sd/xxx",
force_download=force, force_download=force,
@ -95,30 +95,17 @@ class VlmPipeline(PaginatedPipeline):
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT): with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
# Read and concatenate the page doctags: # Read and concatenate the page doctags:
document_tags = "" # document_tags = ""
page_tags = []
page_images = []
for page in conv_res.pages: for page in conv_res.pages:
if page.predictions.doctags is not None: if page.predictions.doctags is not None:
document_tags += page.predictions.doctags.tag_string page_tags.append(page.predictions.doctags.tag_string)
page_images.append(page.image)
conv_res.document = self._turn_tags_into_doc(document_tags, page.image) conv_res.document = self._turn_tags_into_doc(page_tags, page_images)
"""
image_bytes = BytesIO()
if page.image:
page.image.save(image_bytes, format="PNG")
# TODO implement this function
conv_res.document = self._turn_tags_into_doc(
document_tags, image_bytes.getvalue()
)
# Generate page images in the output
if self.pipeline_options.generate_page_images:
for page in conv_res.pages:
assert page.image is not None
page_no = page.page_no + 1
conv_res.document.pages[page_no].image = ImageRef.from_pil(
page.image, dpi=int(72 * self.pipeline_options.images_scale)
)
"""
# Generate images of the requested element types # Generate images of the requested element types
if ( if (
self.pipeline_options.generate_picture_images self.pipeline_options.generate_picture_images
@ -153,9 +140,8 @@ class VlmPipeline(PaginatedPipeline):
return conv_res return conv_res
# def _turn_tags_into_doc(self, xml_content: str, image_bytes: bytes) -> (DoclingDocument, list):
def _turn_tags_into_doc( def _turn_tags_into_doc(
self, xml_content: str, pil_image: Optional[Image] = None self, full_doc_xml_content: list[str], pil_images: list[Image | None]
) -> DoclingDocument: ) -> DoclingDocument:
def extract_text(tag_content: str) -> str: def extract_text(tag_content: str) -> str:
return re.sub(r"<.*?>", "", tag_content).strip() return re.sub(r"<.*?>", "", tag_content).strip()
@ -251,7 +237,7 @@ class VlmPipeline(PaginatedPipeline):
cell_text = texts[i + 1] cell_text = texts[i + 1]
right_offset = 2 right_offset = 2
# TODO: Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
next_right_cell = texts[i + right_offset] next_right_cell = texts[i + right_offset]
next_bottom_cell = "" next_bottom_cell = ""
@ -333,212 +319,236 @@ class VlmPipeline(PaginatedPipeline):
doc = DoclingDocument(name="Example Document") doc = DoclingDocument(name="Example Document")
current_group = None current_group = None
lines = xml_content.split("\n")
# pil_image = input_image #Image.open(BytesIO(image_bytes))
bounding_boxes = []
for line in lines: for pg_idx, xml_content in enumerate(full_doc_xml_content):
line = line.strip() pil_image = pil_images[pg_idx]
line = line.replace("<doc_tag>", "") page_no = pg_idx + 1
if line.startswith("<paragraph>"): lines = xml_content.split("\n")
content = extract_text(line) # pil_image = input_image #Image.open(BytesIO(image_bytes))
prov_item = extract_bounding_box(line) bounding_boxes = []
if prov_item:
bounding_boxes.append((prov_item, "red"))
doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
if prov_item
else None
),
)
elif line.startswith("<title>"):
content = extract_text(line)
prov_item = extract_bounding_box(line)
if prov_item:
bounding_boxes.append((prov_item, "blue"))
current_group = doc.add_group(label=GroupLabel.SECTION, name=content)
doc.add_text(
label=DocItemLabel.TITLE,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
if prov_item
else None
),
)
elif line.startswith("<section-header>"): for line in lines:
content = extract_text(line) line = line.strip()
prov_item = extract_bounding_box(line) line = line.replace("<doc_tag>", "")
if prov_item: if line.startswith("<paragraph>"):
bounding_boxes.append((prov_item, "green")) content = extract_text(line)
current_group = doc.add_group(label=GroupLabel.SECTION, name=content) prov_item = extract_bounding_box(line)
doc.add_text( if prov_item:
label=DocItemLabel.SECTION_HEADER, bounding_boxes.append((prov_item, "red"))
text=content, doc.add_text(
parent=current_group, label=DocItemLabel.PARAGRAPH,
prov=( text=content,
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)] parent=current_group,
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1) prov=(
if prov_item # [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
else None ProvenanceItem(
), bbox=prov_item, charspan=(0, 0), page_no=page_no
) )
if prov_item
else None
),
)
elif line.startswith("<title>"):
content = extract_text(line)
prov_item = extract_bounding_box(line)
if prov_item:
bounding_boxes.append((prov_item, "blue"))
current_group = doc.add_group(
label=GroupLabel.SECTION, name=content
)
doc.add_text(
label=DocItemLabel.TITLE,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
if prov_item
else None
),
)
elif line.startswith("<otsl>"): elif line.startswith("<section-header>"):
prov_item = extract_bounding_box(line) content = extract_text(line)
if prov_item: prov_item = extract_bounding_box(line)
bounding_boxes.append((prov_item, "aquamarine")) if prov_item:
bounding_boxes.append((prov_item, "green"))
current_group = doc.add_group(
label=GroupLabel.SECTION, name=content
)
doc.add_text(
label=DocItemLabel.SECTION_HEADER,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
if prov_item
else None
),
)
table_data = parse_table_content(line) elif line.startswith("<otsl>"):
doc.add_table(data=table_data, parent=current_group) prov_item = extract_bounding_box(line)
if prov_item:
bounding_boxes.append((prov_item, "aquamarine"))
elif line.startswith("<footnote>"): table_data = parse_table_content(line)
content = extract_text(line) doc.add_table(data=table_data, parent=current_group)
prov_item = extract_bounding_box(line)
if prov_item:
bounding_boxes.append((prov_item, "orange"))
doc.add_text(
label=DocItemLabel.FOOTNOTE,
text=content,
parent=current_group,
prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
if prov_item
else None
),
)
elif line.startswith("<page-header>"): elif line.startswith("<footnote>"):
content = extract_text(line) content = extract_text(line)
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if prov_item: if prov_item:
bounding_boxes.append((prov_item, "purple")) bounding_boxes.append((prov_item, "orange"))
doc.add_text( doc.add_text(
label=DocItemLabel.PAGE_HEADER, label=DocItemLabel.FOOTNOTE,
text=content, text=content,
parent=current_group, parent=current_group,
prov=( prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)] # [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1) ProvenanceItem(
if prov_item bbox=prov_item, charspan=(0, 0), page_no=page_no
else None )
), if prov_item
) else None
),
)
elif line.startswith("<page-footer>"): elif line.startswith("<page-header>"):
content = extract_text(line) content = extract_text(line)
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if prov_item: if prov_item:
bounding_boxes.append((prov_item, "cyan")) bounding_boxes.append((prov_item, "purple"))
doc.add_text( doc.add_text(
label=DocItemLabel.PAGE_FOOTER, label=DocItemLabel.PAGE_HEADER,
text=content, text=content,
parent=current_group, parent=current_group,
prov=( prov=(
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)] # [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1) ProvenanceItem(
if prov_item bbox=prov_item, charspan=(0, 0), page_no=page_no
else None )
), if prov_item
) else None
),
)
elif line.startswith("<figure>"): elif line.startswith("<page-footer>"):
bbox = extract_bounding_box(line) content = extract_text(line)
if bbox: prov_item = extract_bounding_box(line)
bounding_boxes.append((bbox, "yellow")) if prov_item:
if pil_image: bounding_boxes.append((prov_item, "cyan"))
# Convert bounding box normalized to 0-100 into pixel coordinates for cropping doc.add_text(
width, height = pil_image.size label=DocItemLabel.PAGE_FOOTER,
crop_box = ( text=content,
int(bbox.l * width), parent=current_group,
int(bbox.t * height), prov=(
int(bbox.r * width), # [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
int(bbox.b * height), ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
)
if prov_item
else None
),
)
elif line.startswith("<figure>"):
bbox = extract_bounding_box(line)
if bbox:
bounding_boxes.append((bbox, "yellow"))
if pil_image:
# Convert bounding box normalized to 0-100 into pixel coordinates for cropping
width, height = pil_image.size
crop_box = (
int(bbox.l * width),
int(bbox.t * height),
int(bbox.r * width),
int(bbox.b * height),
)
cropped_image = pil_image.crop(crop_box)
doc.add_picture(
parent=current_group,
image=ImageRef.from_pil(image=cropped_image, dpi=300),
prov=ProvenanceItem(
bbox=bbox, charspan=(0, 0), page_no=page_no
),
)
else:
doc.add_picture(
parent=current_group,
prov=ProvenanceItem(
bbox=bbox, charspan=(0, 0), page_no=page_no
),
)
elif line.startswith("<list>"):
content = extract_text(line)
prov_item_inst = None
prov_item = extract_bounding_box(line)
if prov_item:
bounding_boxes.append((prov_item, "brown"))
prov_item_inst = ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
) )
doc.add_text(
label=DocItemLabel.LIST_ITEM,
text=content,
parent=current_group,
prov=prov_item_inst if prov_item_inst else None,
)
cropped_image = pil_image.crop(crop_box) elif line.startswith("<caption>"):
doc.add_picture( content = extract_text(line)
parent=current_group, prov_item_inst = None
image=ImageRef.from_pil(image=cropped_image, dpi=300), prov_item = extract_bounding_box(line)
prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1), if prov_item:
bounding_boxes.append((prov_item, "magenta"))
prov_item_inst = ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
) )
else: doc.add_text(
doc.add_picture( label=DocItemLabel.PARAGRAPH,
parent=current_group, text=content,
prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1), parent=current_group,
prov=prov_item_inst if prov_item_inst else None,
)
elif line.startswith("<checkbox-unselected>"):
content = extract_text(line)
prov_item_inst = None
prov_item = extract_bounding_box(line)
if prov_item:
bounding_boxes.append((prov_item, "gray"))
prov_item_inst = ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=page_no
) )
elif line.startswith("<list>"): doc.add_text(
content = extract_text(line) label=DocItemLabel.CHECKBOX_UNSELECTED,
prov_item_inst = None text=content,
prov_item = extract_bounding_box(line) parent=current_group,
if prov_item: prov=prov_item_inst if prov_item_inst else None,
bounding_boxes.append((prov_item, "brown"))
prov_item_inst = ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=1
) )
doc.add_text(
label=DocItemLabel.LIST_ITEM,
text=content,
parent=current_group,
prov=prov_item_inst if prov_item_inst else None,
)
elif line.startswith("<caption>"): elif line.startswith("<checkbox-selected>"):
content = extract_text(line) content = extract_text(line)
prov_item_inst = None prov_item_inst = None
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if prov_item: if prov_item:
bounding_boxes.append((prov_item, "magenta")) bounding_boxes.append((prov_item, "black"))
prov_item_inst = ProvenanceItem( prov_item_inst = ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=1 bbox=prov_item, charspan=(0, 0), page_no=page_no
)
doc.add_text(
label=DocItemLabel.CHECKBOX_SELECTED,
text=content,
parent=current_group,
prov=prov_item_inst if prov_item_inst else None,
) )
doc.add_text( # return doc, bounding_boxes
label=DocItemLabel.PARAGRAPH,
text=content,
parent=current_group,
prov=prov_item_inst if prov_item_inst else None,
)
elif line.startswith("<checkbox-unselected>"):
content = extract_text(line)
prov_item_inst = None
prov_item = extract_bounding_box(line)
if prov_item:
bounding_boxes.append((prov_item, "gray"))
prov_item_inst = ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=1
)
doc.add_text(
label=DocItemLabel.CHECKBOX_UNSELECTED,
text=content,
parent=current_group,
prov=prov_item_inst if prov_item_inst else None,
)
elif line.startswith("<checkbox-selected>"):
content = extract_text(line)
prov_item_inst = None
prov_item = extract_bounding_box(line)
if prov_item:
bounding_boxes.append((prov_item, "black"))
prov_item_inst = ProvenanceItem(
bbox=prov_item, charspan=(0, 0), page_no=1
)
doc.add_text(
label=DocItemLabel.CHECKBOX_SELECTED,
text=content,
parent=current_group,
prov=prov_item_inst if prov_item_inst else None,
)
# return doc, bounding_boxes
return doc return doc
@classmethod @classmethod

View File

@ -6,10 +6,11 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline from docling.pipeline.vlm_pipeline import VlmPipeline
# source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
# source = "tests/data/2305.03393v1-pg9-img.png" # source = "tests/data/2305.03393v1-pg9-img.png"
source = "tests/data/2305.03393v1-pg9.pdf" # source = "tests/data/2305.03393v1-pg9.pdf"
# source = "page.png" # source = "demo_data/page.png"
# source = "demo_data/original_tables.pdf"
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True pipeline_options.generate_page_images = True