From f2751e11f9c2d3e5b68fd8c55d79c4c4c544ffa4 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 21 Jan 2025 17:37:11 +0100 Subject: [PATCH] Introduced SmolDoclingOptions to configure model parameters (such as query and artifacts path) via client code, see example in minimal_smol_docling. Provisioning for other potential vlm all-in-one models. Signed-off-by: Maksym Lysak --- docling/datamodel/pipeline_options.py | 10 ++++++++++ docling/models/smol_docling_model.py | 20 +++++++++++++++----- docling/pipeline/vlm_pipeline.py | 5 ++--- docs/examples/minimal_smol_docling.py | 21 ++++++++++++++------- 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index f08a1bff..6ccd2f73 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -254,6 +254,14 @@ granite_picture_description = PictureDescriptionVlmOptions( ) +class SmolDoclingOptions(BaseModel): + artifacts_path: str = "" + question: str = "Perform Layout Analysis." + load_in_8bit: bool = True + llm_int8_threshold: float = 6.0 + quantized: bool = False + + # Define an enum for the backend options class PdfBackend(str, Enum): """Enum of valid PDF backends.""" @@ -313,6 +321,8 @@ class PdfPipelineOptions(PipelineOptions): Field(discriminator="kind"), ] = smolvlm_picture_description + vlm_options: Union[SmolDoclingOptions,] = Field(SmolDoclingOptions()) + images_scale: float = 1.0 generate_page_images: bool = False generate_picture_images: bool = False diff --git a/docling/models/smol_docling_model.py b/docling/models/smol_docling_model.py index 252ae0f2..86db3d7e 100644 --- a/docling/models/smol_docling_model.py +++ b/docling/models/smol_docling_model.py @@ -13,7 +13,11 @@ from transformers import ( # type: ignore from docling.datamodel.base_models import DocTagsPrediction, Page from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + SmolDoclingOptions, +) from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.utils.accelerator_utils import decide_device @@ -24,17 +28,23 @@ _log = logging.getLogger(__name__) class SmolDoclingModel(BasePageModel): - def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions): + def __init__( + self, + artifacts_path: Path, + accelerator_options: AcceleratorOptions, + vlm_options: SmolDoclingOptions, + ): device = decide_device(accelerator_options.device) self.device = device _log.info("Available device for SmolDocling: {}".format(device)) # PARAMETERS: - self.param_question = "Perform Layout Analysis." + self.param_question = vlm_options.question # "Perform Layout Analysis." self.param_quantization_config = BitsAndBytesConfig( - load_in_8bit=True, llm_int8_threshold=6.0 + load_in_8bit=vlm_options.load_in_8bit, # True, + llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0 ) - self.param_quantized = False + self.param_quantized = vlm_options.quantized # False self.processor = AutoProcessor.from_pretrained(artifacts_path) if not self.param_quantized: diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 38e3c983..c484ca24 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -36,8 +36,6 @@ _log = logging.getLogger(__name__) class VlmPipeline(PaginatedPipeline): - # _smol_vlm_path = "SmolDocling-0.0.2" - _smol_vlm_path = "SmolDocling_2.7_DT_0.7" def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) @@ -60,8 +58,9 @@ class VlmPipeline(PaginatedPipeline): self.build_pipe = [ SmolDoclingModel( - artifacts_path=self.artifacts_path / VlmPipeline._smol_vlm_path, + artifacts_path=self.artifacts_path, accelerator_options=pipeline_options.accelerator_options, + vlm_options=self.pipeline_options.vlm_options, ), ] diff --git a/docs/examples/minimal_smol_docling.py b/docs/examples/minimal_smol_docling.py index 6e95bbf4..a3d36a30 100644 --- a/docs/examples/minimal_smol_docling.py +++ b/docs/examples/minimal_smol_docling.py @@ -8,24 +8,31 @@ import yaml from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions, SmolDoclingOptions from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline sources = [ - # "https://arxiv.org/pdf/2408.09869" + # "https://arxiv.org/pdf/2408.09869", # "tests/data/2305.03393v1-pg9-img.png", "tests/data/2305.03393v1-pg9.pdf", - # "demo_data/page.png", - # "demo_data/original_tables.pdf", ] pipeline_options = PdfPipelineOptions() pipeline_options.generate_page_images = True -pipeline_options.force_backend_text = ( - False # If True, text from backend will be used instead of generated text +# If force_backend_text = True, text from backend will be used instead of generated text +pipeline_options.force_backend_text = False +pipeline_options.artifacts_path = "model_artifacts/SmolDocling_2.7_DT_0.7" + +vlm_options = SmolDoclingOptions( + artifacts_path="model_artifacts/SmolDocling_2.7_DT_0.7", + question="Perform Layout Analysis.", + load_in_8bit=True, + llm_int8_threshold=6.0, + quantized=False, ) -pipeline_options.artifacts_path = "model_artifacts" + +pipeline_options.vlm_options = vlm_options from docling_core.types.doc import DocItemLabel, ImageRefMode from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS