diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 5f35b520..02852a6d 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -211,7 +211,8 @@ class PicDescVlmOptions(PicDescBaseOptions):
     repo_id: str
     prompt: str = "Describe this image in a few sentences."
 
-    max_new_tokens: int = 200
+    # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
+    generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
 
 
 # class PicDescSmolVlmOptions(PicDescVlmOptions):
diff --git a/docling/models/pic_description_vlm_model.py b/docling/models/pic_description_vlm_model.py
index 3103c405..812da4bc 100644
--- a/docling/models/pic_description_vlm_model.py
+++ b/docling/models/pic_description_vlm_model.py
@@ -69,6 +69,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
         return Path(download_path)
 
     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        from transformers import GenerationConfig
 
         # Create input messages
         messages = [
@@ -81,7 +82,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
             },
         ]
 
-        # TODO: set seed for reproducibility
         # TODO: do batch generation
 
         for image in images:
@@ -94,7 +94,8 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
 
             # Generate outputs
             generated_ids = self.model.generate(
-                **inputs, max_new_tokens=self.options.max_new_tokens
+                **inputs,
+                generation_config=GenerationConfig(**self.options.generation_config),
            )
             generated_texts = self.processor.batch_decode(
                 generated_ids[:, inputs["input_ids"].shape[1] :],
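
Usage note (not part of the patch): the change replaces the fixed max_new_tokens field with a pass-through dict that is unpacked into transformers.GenerationConfig, so callers can tune any supported decoding parameter. A minimal sketch of how a caller might exercise this, assuming PicDescVlmOptions is constructed directly and that Dict/Any are already imported in pipeline_options.py; the repo_id value below is a placeholder, not one mandated by the diff:

    # Minimal sketch: configuring decoding via the new generation_config dict.
    # The repo_id is a hypothetical model id; any keyword accepted by
    # transformers.GenerationConfig can be passed through the dict.
    from docling.datamodel.pipeline_options import PicDescVlmOptions

    options = PicDescVlmOptions(
        repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",  # placeholder model id
        generation_config=dict(
            max_new_tokens=120,  # cap the caption length
            do_sample=True,      # sample instead of greedy decoding
            temperature=0.7,     # flatter distribution when sampling
            top_p=0.9,           # nucleus-sampling cutoff
        ),
    )

This also explains the removed "TODO: set seed for reproducibility": with the new default do_sample=False, decoding is greedy and therefore deterministic, so presumably no seed is needed unless a caller opts into sampling.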