feat: Describe pictures using vision models (#259)

* draft for picture description models

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* vlm description using AutoModelForVision2Seq

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add generation options

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update vlm API

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* allow only localhost traffic

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename model

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* do not run with vlm api

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* more renaming

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix examples path

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply CLI download login

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix name of cli argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use with_smolvlm in models download

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-02-07 16:30:42 +01:00
committed by GitHub
parent fba3cf9be7
commit 4cc6e3ea5e
14 changed files with 508 additions and 11 deletions

View File

@@ -59,6 +59,10 @@ onnxruntime = [
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
transformers = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
]
pillow = "^10.0.0"
tqdm = "^4.65.0"
@@ -121,6 +125,7 @@ torchvision = [
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
vlm = ["transformers"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]
@@ -162,7 +167,8 @@ module = [
"deepsearch_glm.*",
"lxml.*",
"bs4.*",
"huggingface_hub.*"
"huggingface_hub.*",
"transformers.*",
]
ignore_missing_imports = true