From 853544ba1126f1b29e24ce5bba8b839b05919cc1 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Thu, 13 Feb 2025 17:19:53 +0100 Subject: [PATCH] Addressing PR comments, added enabled property to SmolDocling, and related VLM pipeline option, few other minor things Signed-off-by: Maksym Lysak --- docling/datamodel/pipeline_options.py | 23 +++++- docling/models/smol_docling_model.py | 98 +++++++++++++++-------- docling/pipeline/base_pipeline.py | 5 +- docling/pipeline/standard_pdf_pipeline.py | 1 - docling/pipeline/vlm_pipeline.py | 19 +++-- docs/examples/minimal_smol_docling.py | 2 +- 6 files changed, 102 insertions(+), 46 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 274d4438..b4c0766b 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -255,9 +255,7 @@ granite_picture_description = PictureDescriptionVlmOptions( class SmolDoclingOptions(BaseModel): - artifacts_path: str = "" - question: str = "Convert this page to docling." # "Perform Layout Analysis." - + question: str = "Convert this page to docling." 
load_in_8bit: bool = True llm_int8_threshold: float = 6.0 quantized: bool = False @@ -294,7 +292,24 @@ class PipelineOptions(BaseModel): enable_remote_services: bool = False -class PdfPipelineOptions(PipelineOptions): +class PaginatedPipelineOptions(PipelineOptions): + images_scale: float = 1.0 + generate_page_images: bool = False + generate_picture_images: bool = False + + +class VlmPipelineOptions(PaginatedPipelineOptions): + artifacts_path: Optional[Union[Path, str]] = None + do_vlm: bool = True # True: perform inference of Visual Language Model + + force_backend_text: bool = ( + False # (To be used with vlms, or other generative models) + ) + # If True, text from backend will be used instead of generated text + vlm_options: Union[SmolDoclingOptions,] = Field(SmolDoclingOptions()) + + +class PdfPipelineOptions(PaginatedPipelineOptions): """Options for the PDF pipeline.""" artifacts_path: Optional[Union[Path, str]] = None diff --git a/docling/models/smol_docling_model.py b/docling/models/smol_docling_model.py index 6669de05..00c04fa7 100644 --- a/docling/models/smol_docling_model.py +++ b/docling/models/smol_docling_model.py @@ -3,14 +3,6 @@ import time from pathlib import Path from typing import Iterable, List, Optional -import torch -from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS -from transformers import ( # type: ignore - AutoProcessor, - BitsAndBytesConfig, - Idefics3ForConditionalGeneration, -) - from docling.datamodel.base_models import DocTagsPrediction, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( @@ -32,38 +24,76 @@ class SmolDoclingModel(BasePageModel): def __init__( self, + enabled: bool, + artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions, vlm_options: SmolDoclingOptions, ): - device = decide_device(accelerator_options.device) - self.device = device - _log.info("Available device for SmolDocling: {}".format(device)) + self.enabled = enabled - # 
PARAMETERS: - artifacts_path = Path(vlm_options.artifacts_path) - self.param_question = vlm_options.question # "Perform Layout Analysis." - self.param_quantization_config = BitsAndBytesConfig( - load_in_8bit=vlm_options.load_in_8bit, # True, - llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0 + if self.enabled: + import torch + from transformers import ( # type: ignore + AutoProcessor, + BitsAndBytesConfig, + Idefics3ForConditionalGeneration, + ) + + device = decide_device(accelerator_options.device) + self.device = device + + _log.debug("Available device for SmolDocling: {}".format(device)) + + repo_cache_folder = self._repo_id.replace("/", "--") + + # PARAMETERS: + if artifacts_path is None: + artifacts_path = self.download_models() + elif (artifacts_path / repo_cache_folder).exists(): + artifacts_path = artifacts_path / repo_cache_folder + + self.param_question = vlm_options.question # "Perform Layout Analysis." + self.param_quantization_config = BitsAndBytesConfig( + load_in_8bit=vlm_options.load_in_8bit, # True, + llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0 + ) + self.param_quantized = vlm_options.quantized # False + + self.processor = AutoProcessor.from_pretrained(artifacts_path) + if not self.param_quantized: + self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained( + artifacts_path, + # device_map=device, + torch_dtype=torch.bfloat16, + ) + self.vlm_model = self.vlm_model.to(device) + else: + self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained( + artifacts_path, + # device_map=device, + torch_dtype="auto", + quantization_config=self.param_quantization_config, + ).to(device) + + @staticmethod + def download_models( + local_dir: Optional[Path] = None, + force: bool = False, + progress: bool = False, + ) -> Path: + from huggingface_hub import snapshot_download + from huggingface_hub.utils import disable_progress_bars + + if not progress: + disable_progress_bars() + download_path = snapshot_download( + 
repo_id=SmolDoclingModel._repo_id, + force_download=force, + local_dir=local_dir, + # revision="v0.0.1", ) - self.param_quantized = vlm_options.quantized # False - self.processor = AutoProcessor.from_pretrained(artifacts_path) - if not self.param_quantized: - self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained( - artifacts_path, - device_map=device, - torch_dtype=torch.bfloat16, - # _attn_implementation="flash_attention_2", - ) - self.vlm_model = self.vlm_model.to(device) - else: - self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained( - artifacts_path, - device_map=device, - torch_dtype="auto", - quantization_config=self.param_quantization_config, - ) + return Path(download_path) def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 01ed71a0..d08cf85a 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -116,7 +116,10 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. 
def __init__(self, pipeline_options: PipelineOptions): super().__init__(pipeline_options) - self.keep_backend = True + self.keep_backend = ( + True # For now, need to be able to query for page size post prediction + ) + # self.keep_backend = False def _apply_on_pages( self, conv_res: ConversionResult, page_batch: Iterable[Page] diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 1278c9d1..7df8f15b 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -56,7 +56,6 @@ class StandardPdfPipeline(PaginatedPipeline): def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) - print("------> Init Standard PDF Pipeline!") self.pipeline_options: PdfPipelineOptions artifacts_path: Optional[Path] = None diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 5cb6bf84..04ae40d9 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -97,6 +97,8 @@ class VlmPipeline(PaginatedPipeline): self.build_pipe = [ SmolDoclingModel( + enabled=pipeline_options.do_vlm, + artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, vlm_options=self.pipeline_options.vlm_options, ), @@ -297,6 +299,7 @@ class VlmPipeline(PaginatedPipeline): token for token in tokens if not (token.startswith("", ""]) + # if not (token.startswith(DocumentToken.BEG_LOC) or token in [DocumentToken.BEG_OTSL, DocumentToken.END_OTSL]) ] # Split the string by those tokens to get the in-between text text_parts = re.split(pattern, s) @@ -304,6 +307,7 @@ class VlmPipeline(PaginatedPipeline): token for token in text_parts if not (token.startswith("", ""]) + # if not (token.startswith(DocumentToken.BEG_LOC) or token in [DocumentToken.BEG_OTSL, DocumentToken.END_OTSL]) ] # Remove any empty or purely whitespace strings from text_parts text_parts = [part for part in text_parts if part.strip()] @@ 
-347,10 +351,15 @@ class VlmPipeline(PaginatedPipeline): # Regex for all recognized tags tag_pattern = ( - r"<(?Ptitle|document_index|otsl|section_header_level_1|checkbox_selected|" - r"checkbox_unselected|text|page_header|page_footer|formula|caption|picture|" - r"list_item|footnote|code)>.*?" + rf"<(?P{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|" + rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|" + rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|" + rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|" + rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|" + rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|" + rf"{DocItemLabel.SECTION_HEADER}_level_1|otsl)>.*?" ) + pattern = re.compile(tag_pattern, re.DOTALL) # Go through each match in order @@ -438,8 +447,8 @@ class VlmPipeline(PaginatedPipeline): return doc @classmethod - def get_default_options(cls) -> PdfPipelineOptions: - return PdfPipelineOptions() + def get_default_options(cls) -> VlmPipelineOptions: + return VlmPipelineOptions() @classmethod def is_backend_supported(cls, backend: AbstractDocumentBackend): diff --git a/docs/examples/minimal_smol_docling.py b/docs/examples/minimal_smol_docling.py index 5d64dee4..66252f7b 100644 --- a/docs/examples/minimal_smol_docling.py +++ b/docs/examples/minimal_smol_docling.py @@ -19,7 +19,7 @@ pipeline_options = VlmPipelineOptions() # artifacts_path="~/local_model_artifac pipeline_options.generate_page_images = True # If force_backend_text = True, text from backend will be used instead of generated text pipeline_options.force_backend_text = False - +# pipeline_options.do_vlm = True - use False to disable VLM model (i.e. SmolDocling), extra python imports will not be performed vlm_options = SmolDoclingOptions( # question="Convert this page to docling.",