rename model

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-02-07 09:50:49 +01:00
parent f0ed5aca1e
commit 4e11ce62bb
7 changed files with 37 additions and 35 deletions

View File

@ -184,7 +184,7 @@ class OcrMacOptions(OcrOptions):
) )
class PicDescBaseOptions(BaseModel): class PictureDescriptionBaseOptions(BaseModel):
kind: str kind: str
batch_size: int = 8 batch_size: int = 8
scale: float = 2 scale: float = 2
@ -194,7 +194,7 @@ class PicDescBaseOptions(BaseModel):
) )
class PicDescApiOptions(PicDescBaseOptions): class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
kind: Literal["api"] = "api" kind: Literal["api"] = "api"
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions") url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
@ -206,7 +206,7 @@ class PicDescApiOptions(PicDescBaseOptions):
provenance: str = "" provenance: str = ""
class PicDescVlmOptions(PicDescBaseOptions): class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
kind: Literal["vlm"] = "vlm" kind: Literal["vlm"] = "vlm"
repo_id: str repo_id: str
@ -215,18 +215,11 @@ class PicDescVlmOptions(PicDescBaseOptions):
generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False) generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
# class PicDescSmolVlmOptions(PicDescVlmOptions): smolvlm_picture_description = PictureDescriptionVlmOptions(
# repo_id: str = "HuggingFaceTB/SmolVLM-256M-Instruct" repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)
# phi_pic_desc = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
# class PicDescGraniteOptions(PicDescVlmOptions): granite_picture_description = PictureDescriptionVlmOptions(
# repo_id: str = "ibm-granite/granite-vision-3.1-2b-preview"
# prompt: str = "What is shown in this image?"
smolvlm_pic_desc = PicDescVlmOptions(repo_id="HuggingFaceTB/SmolVLM-256M-Instruct")
# phi_pic_desc = PicDescVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
granite_pic_desc = PicDescVlmOptions(
repo_id="ibm-granite/granite-vision-3.1-2b-preview", repo_id="ibm-granite/granite-vision-3.1-2b-preview",
prompt="What is shown in this image?", prompt="What is shown in this image?",
) )
@ -282,8 +275,9 @@ class PdfPipelineOptions(PipelineOptions):
RapidOcrOptions, RapidOcrOptions,
] = Field(EasyOcrOptions(), discriminator="kind") ] = Field(EasyOcrOptions(), discriminator="kind")
picture_description_options: Annotated[ picture_description_options: Annotated[
Union[PicDescApiOptions, PicDescVlmOptions], Field(discriminator="kind") Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
] = smolvlm_pic_desc Field(discriminator="kind"),
] = smolvlm_picture_description
images_scale: float = 1.0 images_scale: float = 1.0
generate_page_images: bool = False generate_page_images: bool = False

View File

@ -11,7 +11,7 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
from PIL import Image from PIL import Image
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict
from docling.datamodel.pipeline_options import PicDescApiOptions from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
from docling.models.pic_description_base_model import PictureDescriptionBaseModel from docling.models.pic_description_base_model import PictureDescriptionBaseModel
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -49,9 +49,9 @@ class ApiResponse(BaseModel):
class PictureDescriptionApiModel(PictureDescriptionBaseModel): class PictureDescriptionApiModel(PictureDescriptionBaseModel):
# elements_batch_size = 4 # elements_batch_size = 4
def __init__(self, enabled: bool, options: PicDescApiOptions): def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
super().__init__(enabled=enabled, options=options) super().__init__(enabled=enabled, options=options)
self.options: PicDescApiOptions self.options: PictureDescriptionApiOptions
if self.enabled: if self.enabled:
if options.url.host != "localhost": if options.url.host != "localhost":

View File

@ -13,7 +13,7 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
) )
from PIL import Image from PIL import Image
from docling.datamodel.pipeline_options import PicDescBaseOptions from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.base_model import ( from docling.models.base_model import (
BaseItemAndImageEnrichmentModel, BaseItemAndImageEnrichmentModel,
ItemAndImageEnrichmentElement, ItemAndImageEnrichmentElement,
@ -26,7 +26,7 @@ class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
options: PicDescBaseOptions, options: PictureDescriptionBaseOptions,
): ):
self.enabled = enabled self.enabled = enabled
self.options = options self.options = options

View File

@ -4,7 +4,10 @@ from typing import Iterable, List, Optional, Union
from PIL import Image from PIL import Image
from docling.datamodel.pipeline_options import AcceleratorOptions, PicDescVlmOptions from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionVlmOptions,
)
from docling.models.pic_description_base_model import PictureDescriptionBaseModel from docling.models.pic_description_base_model import PictureDescriptionBaseModel
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
@ -15,11 +18,11 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
self, self,
enabled: bool, enabled: bool,
artifacts_path: Optional[Union[Path, str]], artifacts_path: Optional[Union[Path, str]],
options: PicDescVlmOptions, options: PictureDescriptionVlmOptions,
accelerator_options: AcceleratorOptions, accelerator_options: AcceleratorOptions,
): ):
super().__init__(enabled=enabled, options=options) super().__init__(enabled=enabled, options=options)
self.options: PicDescVlmOptions self.options: PictureDescriptionVlmOptions
if self.enabled: if self.enabled:

View File

@ -13,8 +13,8 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrMacOptions, OcrMacOptions,
PdfPipelineOptions, PdfPipelineOptions,
PicDescApiOptions, PictureDescriptionApiOptions,
PicDescVlmOptions, PictureDescriptionVlmOptions,
RapidOcrOptions, RapidOcrOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
@ -191,14 +191,16 @@ class StandardPdfPipeline(PaginatedPipeline):
self, artifacts_path: Optional[Path] = None self, artifacts_path: Optional[Path] = None
) -> Optional[PictureDescriptionBaseModel]: ) -> Optional[PictureDescriptionBaseModel]:
if isinstance( if isinstance(
self.pipeline_options.picture_description_options, PicDescApiOptions self.pipeline_options.picture_description_options,
PictureDescriptionApiOptions,
): ):
return PictureDescriptionApiModel( return PictureDescriptionApiModel(
enabled=self.pipeline_options.do_picture_description, enabled=self.pipeline_options.do_picture_description,
options=self.pipeline_options.picture_description_options, options=self.pipeline_options.picture_description_options,
) )
elif isinstance( elif isinstance(
self.pipeline_options.picture_description_options, PicDescVlmOptions self.pipeline_options.picture_description_options,
PictureDescriptionVlmOptions,
): ):
return PictureDescriptionVlmModel( return PictureDescriptionVlmModel(
enabled=self.pipeline_options.do_picture_description, enabled=self.pipeline_options.do_picture_description,

View File

@ -4,10 +4,10 @@ from pathlib import Path
from docling_core.types.doc import PictureItem from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( # PicDescSmolVlmOptions, PicDescGraniteOptions from docling.datamodel.pipeline_options import (
PdfPipelineOptions, PdfPipelineOptions,
granite_pic_desc, granite_picture_description,
smolvlm_pic_desc, smolvlm_picture_description,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
@ -19,7 +19,7 @@ def main():
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = smolvlm_pic_desc pipeline_options.picture_description_options = smolvlm_picture_description
# pipeline_options.picture_description_options = granite_pic_desc # pipeline_options.picture_description_options = granite_pic_desc
pipeline_options.picture_description_options.prompt = ( pipeline_options.picture_description_options.prompt = (

View File

@ -4,7 +4,10 @@ from pathlib import Path
from docling_core.types.doc import PictureItem from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
PictureDescriptionApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
@ -19,7 +22,7 @@ def main():
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = PicDescApiOptions( pipeline_options.picture_description_options = PictureDescriptionApiOptions(
url="http://localhost:8000/v1/chat/completions", url="http://localhost:8000/v1/chat/completions",
params=dict( params=dict(
model="HuggingFaceTB/SmolVLM-256M-Instruct", model="HuggingFaceTB/SmolVLM-256M-Instruct",