From 4e11ce62bb80408a652c01f0b67e169df294a4d8 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 7 Feb 2025 09:50:49 +0100 Subject: [PATCH] rename model Signed-off-by: Michele Dolfi --- docling/datamodel/pipeline_options.py | 28 ++++++++------------ docling/models/pic_description_api_model.py | 6 ++--- docling/models/pic_description_base_model.py | 4 +-- docling/models/pic_description_vlm_model.py | 9 ++++--- docling/pipeline/standard_pdf_pipeline.py | 10 ++++--- docs/examples/pictures_description.py | 8 +++--- docs/examples/pictures_description_api.py | 7 +++-- 7 files changed, 37 insertions(+), 35 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 7e7d5378..21d92b6d 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -184,7 +184,7 @@ class OcrMacOptions(OcrOptions): ) -class PicDescBaseOptions(BaseModel): +class PictureDescriptionBaseOptions(BaseModel): kind: str batch_size: int = 8 scale: float = 2 @@ -194,7 +194,7 @@ class PicDescBaseOptions(BaseModel): ) -class PicDescApiOptions(PicDescBaseOptions): +class PictureDescriptionApiOptions(PictureDescriptionBaseOptions): kind: Literal["api"] = "api" url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions") @@ -206,7 +206,7 @@ class PicDescApiOptions(PicDescBaseOptions): provenance: str = "" -class PicDescVlmOptions(PicDescBaseOptions): +class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions): kind: Literal["vlm"] = "vlm" repo_id: str @@ -215,18 +215,11 @@ class PicDescVlmOptions(PicDescBaseOptions): generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False) -# class PicDescSmolVlmOptions(PicDescVlmOptions): -# repo_id: str = "HuggingFaceTB/SmolVLM-256M-Instruct" - - -# class PicDescGraniteOptions(PicDescVlmOptions): -# repo_id: str = "ibm-granite/granite-vision-3.1-2b-preview" -# prompt: str = "What is shown in this image?" - - -smolvlm_pic_desc = PicDescVlmOptions(repo_id="HuggingFaceTB/SmolVLM-256M-Instruct") -# phi_pic_desc = PicDescVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct") -granite_pic_desc = PicDescVlmOptions( +smolvlm_picture_description = PictureDescriptionVlmOptions( + repo_id="HuggingFaceTB/SmolVLM-256M-Instruct" +) +# phi_pic_desc = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct") +granite_picture_description = PictureDescriptionVlmOptions( repo_id="ibm-granite/granite-vision-3.1-2b-preview", prompt="What is shown in this image?", ) @@ -282,8 +275,9 @@ class PdfPipelineOptions(PipelineOptions): RapidOcrOptions, ] = Field(EasyOcrOptions(), discriminator="kind") picture_description_options: Annotated[ - Union[PicDescApiOptions, PicDescVlmOptions], Field(discriminator="kind") - ] = smolvlm_pic_desc + Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions], + Field(discriminator="kind"), + ] = smolvlm_picture_description images_scale: float = 1.0 generate_page_images: bool = False diff --git a/docling/models/pic_description_api_model.py b/docling/models/pic_description_api_model.py index 85fc55eb..8b5cc7b5 100644 --- a/docling/models/pic_description_api_model.py +++ b/docling/models/pic_description_api_model.py @@ -11,7 +11,7 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co from PIL import Image from pydantic import BaseModel, ConfigDict -from docling.datamodel.pipeline_options import PicDescApiOptions +from docling.datamodel.pipeline_options import PictureDescriptionApiOptions from docling.models.pic_description_base_model import PictureDescriptionBaseModel _log = logging.getLogger(__name__) @@ -49,9 +49,9 @@ class ApiResponse(BaseModel): class PictureDescriptionApiModel(PictureDescriptionBaseModel): # elements_batch_size = 4 - def __init__(self, enabled: bool, options: PicDescApiOptions): + def __init__(self, enabled: bool, options: PictureDescriptionApiOptions): super().__init__(enabled=enabled, options=options) - self.options: PicDescApiOptions + self.options: PictureDescriptionApiOptions if self.enabled: if options.url.host != "localhost": diff --git a/docling/models/pic_description_base_model.py b/docling/models/pic_description_base_model.py index 9be9e678..b653e0e3 100644 --- a/docling/models/pic_description_base_model.py +++ b/docling/models/pic_description_base_model.py @@ -13,7 +13,7 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co ) from PIL import Image -from docling.datamodel.pipeline_options import PicDescBaseOptions +from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions from docling.models.base_model import ( BaseItemAndImageEnrichmentModel, ItemAndImageEnrichmentElement, @@ -26,7 +26,7 @@ class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel): def __init__( self, enabled: bool, - options: PicDescBaseOptions, + options: PictureDescriptionBaseOptions, ): self.enabled = enabled self.options = options diff --git a/docling/models/pic_description_vlm_model.py b/docling/models/pic_description_vlm_model.py index 812da4bc..0e9d2abc 100644 --- a/docling/models/pic_description_vlm_model.py +++ b/docling/models/pic_description_vlm_model.py @@ -4,7 +4,10 @@ from typing import Iterable, List, Optional, Union from PIL import Image -from docling.datamodel.pipeline_options import AcceleratorOptions, PicDescVlmOptions +from docling.datamodel.pipeline_options import ( + AcceleratorOptions, + PictureDescriptionVlmOptions, +) from docling.models.pic_description_base_model import PictureDescriptionBaseModel from docling.utils.accelerator_utils import decide_device @@ -15,11 +18,11 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel): self, enabled: bool, artifacts_path: Optional[Union[Path, str]], - options: PicDescVlmOptions, + options: PictureDescriptionVlmOptions, accelerator_options: AcceleratorOptions, ): super().__init__(enabled=enabled, options=options) - self.options: PicDescVlmOptions + self.options: PictureDescriptionVlmOptions if self.enabled: diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index c0faaf0e..e3ef3379 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -13,8 +13,8 @@ from docling.datamodel.pipeline_options import ( EasyOcrOptions, OcrMacOptions, PdfPipelineOptions, - PicDescApiOptions, - PicDescVlmOptions, + PictureDescriptionApiOptions, + PictureDescriptionVlmOptions, RapidOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, @@ -191,14 +191,16 @@ class StandardPdfPipeline(PaginatedPipeline): self, artifacts_path: Optional[Path] = None ) -> Optional[PictureDescriptionBaseModel]: if isinstance( - self.pipeline_options.picture_description_options, PicDescApiOptions + self.pipeline_options.picture_description_options, + PictureDescriptionApiOptions, ): return PictureDescriptionApiModel( enabled=self.pipeline_options.do_picture_description, options=self.pipeline_options.picture_description_options, ) elif isinstance( - self.pipeline_options.picture_description_options, PicDescVlmOptions + self.pipeline_options.picture_description_options, + PictureDescriptionVlmOptions, ): return PictureDescriptionVlmModel( enabled=self.pipeline_options.do_picture_description, diff --git a/docs/examples/pictures_description.py b/docs/examples/pictures_description.py index d5f72045..b276b4d2 100644 --- a/docs/examples/pictures_description.py +++ b/docs/examples/pictures_description.py @@ -4,10 +4,10 @@ from pathlib import Path from docling_core.types.doc import PictureItem from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( # PicDescSmolVlmOptions, PicDescGraniteOptions +from docling.datamodel.pipeline_options import ( PdfPipelineOptions, - granite_pic_desc, - smolvlm_pic_desc, + granite_picture_description, + smolvlm_picture_description, ) from docling.document_converter import DocumentConverter, PdfFormatOption @@ -19,7 +19,7 @@ def main(): pipeline_options = PdfPipelineOptions() pipeline_options.do_picture_description = True - pipeline_options.picture_description_options = smolvlm_pic_desc + pipeline_options.picture_description_options = smolvlm_picture_description # pipeline_options.picture_description_options = granite_pic_desc pipeline_options.picture_description_options.prompt = ( diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py index 7c42162c..0dec9ee7 100644 --- a/docs/examples/pictures_description_api.py +++ b/docs/examples/pictures_description_api.py @@ -4,7 +4,10 @@ from pathlib import Path from docling_core.types.doc import PictureItem from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions, PicDescApiOptions +from docling.datamodel.pipeline_options import ( + PdfPipelineOptions, + PictureDescriptionApiOptions, +) from docling.document_converter import DocumentConverter, PdfFormatOption @@ -19,7 +22,7 @@ def main(): pipeline_options = PdfPipelineOptions() pipeline_options.do_picture_description = True - pipeline_options.picture_description_options = PicDescApiOptions( + pipeline_options.picture_description_options = PictureDescriptionApiOptions( url="http://localhost:8000/v1/chat/completions", params=dict( model="HuggingFaceTB/SmolVLM-256M-Instruct",