feat(vlm): Ability to preprocess VLM response (#1907)

* Add ability to preprocess VLM response

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>

* Move response decoding to the VLM options (overriding it requires inheriting from the options class). Per-page prompt formulation is also moved to the VLM options to keep the API consistent; a usage sketch follows the sign-offs below.

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>

---------

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>
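
A minimal usage sketch of the two new hooks. BaseVlmOptions, build_prompt, and decode_response follow this change; the subclass name and the per-page/fence-stripping logic are illustrative assumptions, not part of the commit:

    class MyVlmOptions(BaseVlmOptions):
        # Hypothetical subclass; only the overridden hook names come from
        # this commit.

        def build_prompt(self, page):
            # Assumption: `page` is the parsed page the pipeline passes in
            # (may be None); a subclass can inspect it to vary the prompt.
            if page is None:
                return self.prompt
            return self.prompt + "\nTranscribe every visible text line."

        def decode_response(self, text: str) -> str:
            # Assumption: strip a markdown code fence the model may wrap
            # around its output before it is stored on the page.
            text = text.strip()
            if text.startswith("```"):
                text = text.split("\n", 1)[-1]
            if text.endswith("```"):
                text = text[:-3].rstrip()
            return text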
Shkarupa Alex committed 2025-08-12 16:20:24 +03:00 (via GitHub)
parent ccfee05847 · commit 5f050f94e1
5 changed files with 60 additions and 54 deletions

@@ -84,10 +84,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
-                if callable(self.vlm_options.prompt):
-                    user_prompt = self.vlm_options.prompt(page.parsed_page)
-                else:
-                    user_prompt = self.vlm_options.prompt
+                user_prompt = self.vlm_options.build_prompt(page.parsed_page)
                 prompt = self.apply_chat_template(
                     self.processor, self.config, user_prompt, num_images=1
                 )
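
For context, the callable-vs-string branch removed above presumably collapses into a default on the options class; a hedged sketch of that default (the body is an assumption consistent with this hunk):

    def build_prompt(self, page) -> str:
        # Default: ignore the parsed page and return the configured prompt
        # string; subclasses override this for per-page prompts.
        return self.prompt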
@@ -142,6 +139,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 _log.debug(
                     f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
                 )
+                page_tags = self.vlm_options.decode_response(page_tags)
                 page.predictions.vlm_response = VlmPrediction(
                     text=page_tags,
                     generation_time=generation_time,
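
The matching default for decode_response is presumably the identity, so pipelines that do not override it keep the previous behavior; a sketch under the same assumption:

    def decode_response(self, text: str) -> str:
        # Default: pass the raw VLM output through unchanged; subclasses
        # override this to post-process the text before it lands in
        # VlmPrediction.
        return text

Putting both hooks on the options object rather than on each model wrapper means every backend calls the same two methods, and users customize behavior by subclassing the options alone.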