fix: Use proper page concatenation in VLM pipeline MD/HTML conversion (#2458)

* Use proper page concatenation in VLM pipeline MD/HTML conversion

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-10-14 14:12:26 +02:00
committed by GitHub
parent 3687d865f8
commit cd7f7ba145

View File

@@ -6,6 +6,7 @@ from typing import List, Optional, Union, cast
from docling_core.types.doc import ( from docling_core.types.doc import (
BoundingBox, BoundingBox,
ContentLayer,
DocItem, DocItem,
DoclingDocument, DoclingDocument,
ImageRef, ImageRef,
@@ -251,9 +252,9 @@ class VlmPipeline(PaginatedPipeline):
# No code blocks found, return original text # No code blocks found, return original text
return text return text
for pg_idx, page in enumerate(conv_res.pages): page_docs = []
page_no = pg_idx + 1 # FIXME: might be incorrect
for pg_idx, page in enumerate(conv_res.pages):
predicted_text = "" predicted_text = ""
if page.predictions.vlm_response: if page.predictions.vlm_response:
predicted_text = page.predictions.vlm_response.text + "\n\n" predicted_text = page.predictions.vlm_response.text + "\n\n"
@@ -273,6 +274,24 @@ class VlmPipeline(PaginatedPipeline):
) )
page_doc = backend.convert() page_doc = backend.convert()
# Modify provenance in place for all items in the page document
for item, level in page_doc.iterate_items(
with_groups=True,
traverse_pictures=True,
included_content_layers=set(ContentLayer),
):
if isinstance(item, DocItem):
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
# Add page metadata to the page document before concatenation
if page.image is not None: if page.image is not None:
pg_width = page.image.width pg_width = page.image.width
pg_height = page.image.height pg_height = page.image.height
@@ -280,27 +299,18 @@ class VlmPipeline(PaginatedPipeline):
pg_width = 1 pg_width = 1
pg_height = 1 pg_height = 1
conv_res.document.add_page( page_doc.add_page(
page_no=page_no, page_no=pg_idx + 1,
size=Size(width=pg_width, height=pg_height), size=Size(width=pg_width, height=pg_height),
image=ImageRef.from_pil(image=page.image, dpi=72) image=ImageRef.from_pil(image=page.image, dpi=72)
if page.image if page.image
else None, else None,
) )
for item, level in page_doc.iterate_items(): page_docs.append(page_doc)
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
conv_res.document.append_child_item(child=item)
return conv_res.document final_doc = DoclingDocument.concatenate(docs=page_docs)
return final_doc
def _turn_html_into_doc(self, conv_res): def _turn_html_into_doc(self, conv_res):
def _extract_html_code(text): def _extract_html_code(text):
@@ -328,9 +338,9 @@ class VlmPipeline(PaginatedPipeline):
# No code blocks found, return original text # No code blocks found, return original text
return text return text
for pg_idx, page in enumerate(conv_res.pages): page_docs = []
page_no = pg_idx + 1 # FIXME: might be incorrect
for pg_idx, page in enumerate(conv_res.pages):
predicted_text = "" predicted_text = ""
if page.predictions.vlm_response: if page.predictions.vlm_response:
predicted_text = page.predictions.vlm_response.text + "\n\n" predicted_text = page.predictions.vlm_response.text + "\n\n"
@@ -341,7 +351,7 @@ class VlmPipeline(PaginatedPipeline):
out_doc = InputDocument( out_doc = InputDocument(
path_or_stream=response_bytes, path_or_stream=response_bytes,
filename=conv_res.input.file.name, filename=conv_res.input.file.name,
format=InputFormat.MD, format=InputFormat.HTML,
backend=HTMLDocumentBackend, backend=HTMLDocumentBackend,
) )
backend = HTMLDocumentBackend( backend = HTMLDocumentBackend(
@@ -350,6 +360,24 @@ class VlmPipeline(PaginatedPipeline):
) )
page_doc = backend.convert() page_doc = backend.convert()
# Modify provenance in place for all items in the page document
for item, level in page_doc.iterate_items(
with_groups=True,
traverse_pictures=True,
included_content_layers=set(ContentLayer),
):
if isinstance(item, DocItem):
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
# Add page metadata to the page document before concatenation
if page.image is not None: if page.image is not None:
pg_width = page.image.width pg_width = page.image.width
pg_height = page.image.height pg_height = page.image.height
@@ -357,27 +385,19 @@ class VlmPipeline(PaginatedPipeline):
pg_width = 1 pg_width = 1
pg_height = 1 pg_height = 1
conv_res.document.add_page( page_doc.add_page(
page_no=page_no, page_no=pg_idx + 1,
size=Size(width=pg_width, height=pg_height), size=Size(width=pg_width, height=pg_height),
image=ImageRef.from_pil(image=page.image, dpi=72) image=ImageRef.from_pil(image=page.image, dpi=72)
if page.image if page.image
else None, else None,
) )
for item, level in page_doc.iterate_items(): page_docs.append(page_doc)
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
conv_res.document.append_child_item(child=item)
return conv_res.document # Concatenate all page documents to preserve hierarchy
final_doc = DoclingDocument.concatenate(docs=page_docs)
return final_doc
@classmethod @classmethod
def get_default_options(cls) -> VlmPipelineOptions: def get_default_options(cls) -> VlmPipelineOptions: