rename model

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-02-07 09:50:49 +01:00
parent f0ed5aca1e
commit 4e11ce62bb
7 changed files with 37 additions and 35 deletions

View File

@ -184,7 +184,7 @@ class OcrMacOptions(OcrOptions):
)
class PicDescBaseOptions(BaseModel):
class PictureDescriptionBaseOptions(BaseModel):
kind: str
batch_size: int = 8
scale: float = 2
@ -194,7 +194,7 @@ class PicDescBaseOptions(BaseModel):
)
class PicDescApiOptions(PicDescBaseOptions):
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
kind: Literal["api"] = "api"
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
@ -206,7 +206,7 @@ class PicDescApiOptions(PicDescBaseOptions):
provenance: str = ""
class PicDescVlmOptions(PicDescBaseOptions):
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
kind: Literal["vlm"] = "vlm"
repo_id: str
@ -215,18 +215,11 @@ class PicDescVlmOptions(PicDescBaseOptions):
generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
# class PicDescSmolVlmOptions(PicDescVlmOptions):
# repo_id: str = "HuggingFaceTB/SmolVLM-256M-Instruct"
# class PicDescGraniteOptions(PicDescVlmOptions):
# repo_id: str = "ibm-granite/granite-vision-3.1-2b-preview"
# prompt: str = "What is shown in this image?"
smolvlm_pic_desc = PicDescVlmOptions(repo_id="HuggingFaceTB/SmolVLM-256M-Instruct")
# phi_pic_desc = PicDescVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
granite_pic_desc = PicDescVlmOptions(
smolvlm_picture_description = PictureDescriptionVlmOptions(
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)
# phi_pic_desc = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
granite_picture_description = PictureDescriptionVlmOptions(
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
prompt="What is shown in this image?",
)
@ -282,8 +275,9 @@ class PdfPipelineOptions(PipelineOptions):
RapidOcrOptions,
] = Field(EasyOcrOptions(), discriminator="kind")
picture_description_options: Annotated[
Union[PicDescApiOptions, PicDescVlmOptions], Field(discriminator="kind")
] = smolvlm_pic_desc
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
Field(discriminator="kind"),
] = smolvlm_picture_description
images_scale: float = 1.0
generate_page_images: bool = False

View File

@ -11,7 +11,7 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
from PIL import Image
from pydantic import BaseModel, ConfigDict
from docling.datamodel.pipeline_options import PicDescApiOptions
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
from docling.models.pic_description_base_model import PictureDescriptionBaseModel
_log = logging.getLogger(__name__)
@ -49,9 +49,9 @@ class ApiResponse(BaseModel):
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
# elements_batch_size = 4
def __init__(self, enabled: bool, options: PicDescApiOptions):
def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
super().__init__(enabled=enabled, options=options)
self.options: PicDescApiOptions
self.options: PictureDescriptionApiOptions
if self.enabled:
if options.url.host != "localhost":

View File

@ -13,7 +13,7 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
)
from PIL import Image
from docling.datamodel.pipeline_options import PicDescBaseOptions
from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.base_model import (
BaseItemAndImageEnrichmentModel,
ItemAndImageEnrichmentElement,
@ -26,7 +26,7 @@ class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
def __init__(
self,
enabled: bool,
options: PicDescBaseOptions,
options: PictureDescriptionBaseOptions,
):
self.enabled = enabled
self.options = options

View File

@ -4,7 +4,10 @@ from typing import Iterable, List, Optional, Union
from PIL import Image
from docling.datamodel.pipeline_options import AcceleratorOptions, PicDescVlmOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionVlmOptions,
)
from docling.models.pic_description_base_model import PictureDescriptionBaseModel
from docling.utils.accelerator_utils import decide_device
@ -15,11 +18,11 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
self,
enabled: bool,
artifacts_path: Optional[Union[Path, str]],
options: PicDescVlmOptions,
options: PictureDescriptionVlmOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: PicDescVlmOptions
self.options: PictureDescriptionVlmOptions
if self.enabled:

View File

@ -13,8 +13,8 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
PicDescApiOptions,
PicDescVlmOptions,
PictureDescriptionApiOptions,
PictureDescriptionVlmOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
@ -191,14 +191,16 @@ class StandardPdfPipeline(PaginatedPipeline):
self, artifacts_path: Optional[Path] = None
) -> Optional[PictureDescriptionBaseModel]:
if isinstance(
self.pipeline_options.picture_description_options, PicDescApiOptions
self.pipeline_options.picture_description_options,
PictureDescriptionApiOptions,
):
return PictureDescriptionApiModel(
enabled=self.pipeline_options.do_picture_description,
options=self.pipeline_options.picture_description_options,
)
elif isinstance(
self.pipeline_options.picture_description_options, PicDescVlmOptions
self.pipeline_options.picture_description_options,
PictureDescriptionVlmOptions,
):
return PictureDescriptionVlmModel(
enabled=self.pipeline_options.do_picture_description,

View File

@ -4,10 +4,10 @@ from pathlib import Path
from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( # PicDescSmolVlmOptions, PicDescGraniteOptions
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
granite_pic_desc,
smolvlm_pic_desc,
granite_picture_description,
smolvlm_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
@ -19,7 +19,7 @@ def main():
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = smolvlm_pic_desc
pipeline_options.picture_description_options = smolvlm_picture_description
# pipeline_options.picture_description_options = granite_picture_description
pipeline_options.picture_description_options.prompt = (

View File

@ -4,7 +4,10 @@ from pathlib import Path
from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
PictureDescriptionApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
@ -19,7 +22,7 @@ def main():
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = PicDescApiOptions(
pipeline_options.picture_description_options = PictureDescriptionApiOptions(
url="http://localhost:8000/v1/chat/completions",
params=dict(
model="HuggingFaceTB/SmolVLM-256M-Instruct",