mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Introduced SmolDoclingOptions so that client code can configure model parameters (such as the query and the artifacts path); see the example in minimal_smol_docling. This also makes provision for other potential all-in-one VLM models.
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
88b9ac6706
commit
f2751e11f9
@ -254,6 +254,14 @@ granite_picture_description = PictureDescriptionVlmOptions(
|
||||
)
|
||||
|
||||
|
||||
class SmolDoclingOptions(BaseModel):
|
||||
artifacts_path: str = ""
|
||||
question: str = "Perform Layout Analysis."
|
||||
load_in_8bit: bool = True
|
||||
llm_int8_threshold: float = 6.0
|
||||
quantized: bool = False
|
||||
|
||||
|
||||
# Define an enum for the backend options
|
||||
class PdfBackend(str, Enum):
|
||||
"""Enum of valid PDF backends."""
|
||||
@ -313,6 +321,8 @@ class PdfPipelineOptions(PipelineOptions):
|
||||
Field(discriminator="kind"),
|
||||
] = smolvlm_picture_description
|
||||
|
||||
vlm_options: Union[SmolDoclingOptions,] = Field(SmolDoclingOptions())
|
||||
|
||||
images_scale: float = 1.0
|
||||
generate_page_images: bool = False
|
||||
generate_picture_images: bool = False
|
||||
|
@ -13,7 +13,11 @@ from transformers import ( # type: ignore
|
||||
|
||||
from docling.datamodel.base_models import DocTagsPrediction, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
SmolDoclingOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
@ -24,17 +28,23 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
class SmolDoclingModel(BasePageModel):
|
||||
|
||||
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
|
||||
def __init__(
|
||||
self,
|
||||
artifacts_path: Path,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
vlm_options: SmolDoclingOptions,
|
||||
):
|
||||
device = decide_device(accelerator_options.device)
|
||||
self.device = device
|
||||
_log.info("Available device for SmolDocling: {}".format(device))
|
||||
|
||||
# PARAMETERS:
|
||||
self.param_question = "Perform Layout Analysis."
|
||||
self.param_question = vlm_options.question # "Perform Layout Analysis."
|
||||
self.param_quantization_config = BitsAndBytesConfig(
|
||||
load_in_8bit=True, llm_int8_threshold=6.0
|
||||
load_in_8bit=vlm_options.load_in_8bit, # True,
|
||||
llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0
|
||||
)
|
||||
self.param_quantized = False
|
||||
self.param_quantized = vlm_options.quantized # False
|
||||
|
||||
self.processor = AutoProcessor.from_pretrained(artifacts_path)
|
||||
if not self.param_quantized:
|
||||
|
@ -36,8 +36,6 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VlmPipeline(PaginatedPipeline):
|
||||
# _smol_vlm_path = "SmolDocling-0.0.2"
|
||||
_smol_vlm_path = "SmolDocling_2.7_DT_0.7"
|
||||
|
||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
@ -60,8 +58,9 @@ class VlmPipeline(PaginatedPipeline):
|
||||
|
||||
self.build_pipe = [
|
||||
SmolDoclingModel(
|
||||
artifacts_path=self.artifacts_path / VlmPipeline._smol_vlm_path,
|
||||
artifacts_path=self.artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
vlm_options=self.pipeline_options.vlm_options,
|
||||
),
|
||||
]
|
||||
|
||||
|
@ -8,24 +8,31 @@ import yaml
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, SmolDoclingOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
sources = [
|
||||
# "https://arxiv.org/pdf/2408.09869"
|
||||
# "https://arxiv.org/pdf/2408.09869",
|
||||
# "tests/data/2305.03393v1-pg9-img.png",
|
||||
"tests/data/2305.03393v1-pg9.pdf",
|
||||
# "demo_data/page.png",
|
||||
# "demo_data/original_tables.pdf",
|
||||
]
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.generate_page_images = True
|
||||
pipeline_options.force_backend_text = (
|
||||
False # If True, text from backend will be used instead of generated text
|
||||
# If force_backend_text = True, text from backend will be used instead of generated text
|
||||
pipeline_options.force_backend_text = False
|
||||
pipeline_options.artifacts_path = "model_artifacts/SmolDocling_2.7_DT_0.7"
|
||||
|
||||
vlm_options = SmolDoclingOptions(
|
||||
artifacts_path="model_artifacts/SmolDocling_2.7_DT_0.7",
|
||||
question="Perform Layout Analysis.",
|
||||
load_in_8bit=True,
|
||||
llm_int8_threshold=6.0,
|
||||
quantized=False,
|
||||
)
|
||||
pipeline_options.artifacts_path = "model_artifacts"
|
||||
|
||||
pipeline_options.vlm_options = vlm_options
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||
|
Loading…
Reference in New Issue
Block a user