fix: Ensure proper images_scale for generated page images in VLM pipelines (#2728)

* fix: Ensure the configured images_scale is used for generated page images in layout+vlm pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Ensure the configured images_scale is applied to page images in the default VLM pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-12-05 13:16:11 +01:00
committed by GitHub
parent d007ba0e6f
commit 609069d12c
3 changed files with 8 additions and 2 deletions

View File

@@ -232,9 +232,12 @@ class ThreadedLayoutVlmPipeline(BasePipeline):
# Initialize pages
start_page, end_page = conv_res.input.limits.page_range
pages: List[Page] = []
images_scale = self.pipeline_options.images_scale
for i in range(conv_res.input.page_count):
if start_page - 1 <= i <= end_page - 1:
page = Page(page_no=i)
if images_scale is not None:
page._default_image_scale = images_scale
page._backend = backend.load_page(i)
if page._backend and page._backend.is_valid():
page.size = page._backend.get_size()

View File

@@ -114,6 +114,9 @@ class VlmPipeline(PaginatedPipeline):
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"):
images_scale = self.pipeline_options.images_scale
if images_scale is not None:
page._default_image_scale = images_scale
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()

View File

@@ -113,7 +113,7 @@ def demo_threaded_layout_vlm_pipeline(
# Queue configuration
queue_max_size=10,
# Image processing
images_scale=2.0,
images_scale=vlm_options.scale,
generate_page_images=True,
enable_remote_services=use_api_vlm,
)
@@ -142,7 +142,7 @@ def demo_threaded_layout_vlm_pipeline(
)
result_layout_aware.document.save_as_html(
out_dir_layout_aware / f"{doc_filename}.html"
out_dir_layout_aware / f"{doc_filename}.html", split_page_view=True
)
for page in result_layout_aware.pages:
_log.info("Page %s of VLM response:", page.page_no)