Introduced SmolDoclingOptions to configure model parameters (such as the query and the artifacts path) from client code; see the example in minimal_smol_docling. This also makes provision for other potential all-in-one VLM models.

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2025-01-21 17:37:11 +01:00
parent 88b9ac6706
commit f2751e11f9
4 changed files with 41 additions and 15 deletions

View File

@ -254,6 +254,14 @@ granite_picture_description = PictureDescriptionVlmOptions(
)
class SmolDoclingOptions(BaseModel):
    """Client-configurable settings for the SmolDocling VLM model."""

    # Path to the model artifacts; defaults to an empty string.
    # NOTE(review): presumably the directory holding the model weights — confirm
    # against the code that consumes it.
    artifacts_path: str = ""

    # Question/prompt text; SmolDoclingModel stores it as its question parameter.
    question: str = "Perform Layout Analysis."

    # Both values are forwarded to BitsAndBytesConfig by SmolDoclingModel.
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0

    # Copied to SmolDoclingModel.param_quantized, which the model branches on.
    # NOTE(review): presumably selects the quantized loading path — confirm.
    quantized: bool = False
# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
@ -313,6 +321,8 @@ class PdfPipelineOptions(PipelineOptions):
Field(discriminator="kind"),
] = smolvlm_picture_description
vlm_options: Union[SmolDoclingOptions,] = Field(SmolDoclingOptions())
images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False

View File

@ -13,7 +13,11 @@ from transformers import ( # type: ignore
from docling.datamodel.base_models import DocTagsPrediction, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
SmolDoclingOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
@ -24,17 +28,23 @@ _log = logging.getLogger(__name__)
class SmolDoclingModel(BasePageModel):
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
def __init__(
self,
artifacts_path: Path,
accelerator_options: AcceleratorOptions,
vlm_options: SmolDoclingOptions,
):
device = decide_device(accelerator_options.device)
self.device = device
_log.info("Available device for SmolDocling: {}".format(device))
# PARAMETERS:
self.param_question = "Perform Layout Analysis."
self.param_question = vlm_options.question # "Perform Layout Analysis."
self.param_quantization_config = BitsAndBytesConfig(
load_in_8bit=True, llm_int8_threshold=6.0
load_in_8bit=vlm_options.load_in_8bit, # True,
llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0
)
self.param_quantized = False
self.param_quantized = vlm_options.quantized # False
self.processor = AutoProcessor.from_pretrained(artifacts_path)
if not self.param_quantized:

View File

@ -36,8 +36,6 @@ _log = logging.getLogger(__name__)
class VlmPipeline(PaginatedPipeline):
# _smol_vlm_path = "SmolDocling-0.0.2"
_smol_vlm_path = "SmolDocling_2.7_DT_0.7"
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
@ -60,8 +58,9 @@ class VlmPipeline(PaginatedPipeline):
self.build_pipe = [
SmolDoclingModel(
artifacts_path=self.artifacts_path / VlmPipeline._smol_vlm_path,
artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
),
]

View File

@ -8,24 +8,31 @@ import yaml
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions, SmolDoclingOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
sources = [
# "https://arxiv.org/pdf/2408.09869"
# "https://arxiv.org/pdf/2408.09869",
# "tests/data/2305.03393v1-pg9-img.png",
"tests/data/2305.03393v1-pg9.pdf",
# "demo_data/page.png",
# "demo_data/original_tables.pdf",
]
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True
pipeline_options.force_backend_text = (
False # If True, text from backend will be used instead of generated text
# If force_backend_text = True, text from backend will be used instead of generated text
pipeline_options.force_backend_text = False
pipeline_options.artifacts_path = "model_artifacts/SmolDocling_2.7_DT_0.7"
vlm_options = SmolDoclingOptions(
artifacts_path="model_artifacts/SmolDocling_2.7_DT_0.7",
question="Perform Layout Analysis.",
load_in_8bit=True,
llm_int8_threshold=6.0,
quantized=False,
)
pipeline_options.artifacts_path = "model_artifacts"
pipeline_options.vlm_options = vlm_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS