From 853544ba1126f1b29e24ce5bba8b839b05919cc1 Mon Sep 17 00:00:00 2001
From: Maksym Lysak <mly@zurich.ibm.com>
Date: Thu, 13 Feb 2025 17:19:53 +0100
Subject: [PATCH] Addressing PR comments, added enabled property to
 SmolDocling and related VLM pipeline option, plus a few other minor things

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
---
 docling/datamodel/pipeline_options.py     | 23 +++++-
 docling/models/smol_docling_model.py      | 98 +++++++++++++++--------
 docling/pipeline/base_pipeline.py         |  5 +-
 docling/pipeline/standard_pdf_pipeline.py |  1 -
 docling/pipeline/vlm_pipeline.py          | 19 +++--
 docs/examples/minimal_smol_docling.py     |  2 +-
 6 files changed, 102 insertions(+), 46 deletions(-)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 274d4438..b4c0766b 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -255,9 +255,7 @@ granite_picture_description = PictureDescriptionVlmOptions(
 
 
 class SmolDoclingOptions(BaseModel):
-    artifacts_path: str = ""
-    question: str = "Convert this page to docling."  # "Perform Layout Analysis."
-
+    question: str = "Convert this page to docling."
     load_in_8bit: bool = True
     llm_int8_threshold: float = 6.0
     quantized: bool = False
@@ -294,7 +292,24 @@ class PipelineOptions(BaseModel):
     enable_remote_services: bool = False
 
 
-class PdfPipelineOptions(PipelineOptions):
+class PaginatedPipelineOptions(PipelineOptions):
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+
+
+class VlmPipelineOptions(PaginatedPipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+    do_vlm: bool = True  # True: perform inference of Visual Language Model
+
+    force_backend_text: bool = (
+        False  # (To be used with VLMs or other generative models)
+    )
+    # If True, text from backend will be used instead of generated text
+    vlm_options: Union[SmolDoclingOptions,] = Field(SmolDoclingOptions())
+
+
+class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
     artifacts_path: Optional[Union[Path, str]] = None
diff --git a/docling/models/smol_docling_model.py b/docling/models/smol_docling_model.py
index 6669de05..00c04fa7 100644
--- a/docling/models/smol_docling_model.py
+++ b/docling/models/smol_docling_model.py
@@ -3,14 +3,6 @@ import time
 from pathlib import Path
 from typing import Iterable, List, Optional
 
-import torch
-from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
-from transformers import (  # type: ignore
-    AutoProcessor,
-    BitsAndBytesConfig,
-    Idefics3ForConditionalGeneration,
-)
-
 from docling.datamodel.base_models import DocTagsPrediction, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
@@ -32,38 +24,76 @@ class SmolDoclingModel(BasePageModel):
 
     def __init__(
         self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
         accelerator_options: AcceleratorOptions,
         vlm_options: SmolDoclingOptions,
     ):
-        device = decide_device(accelerator_options.device)
-        self.device = device
-        _log.info("Available device for SmolDocling: {}".format(device))
+        self.enabled = enabled
 
-        # PARAMETERS:
-        artifacts_path = Path(vlm_options.artifacts_path)
-        self.param_question = vlm_options.question  # "Perform Layout Analysis."
-        self.param_quantization_config = BitsAndBytesConfig(
-            load_in_8bit=vlm_options.load_in_8bit,  # True,
-            llm_int8_threshold=vlm_options.llm_int8_threshold,  # 6.0
+        if self.enabled:
+            import torch
+            from transformers import (  # type: ignore
+                AutoProcessor,
+                BitsAndBytesConfig,
+                Idefics3ForConditionalGeneration,
+            )
+
+            device = decide_device(accelerator_options.device)
+            self.device = device
+
+            _log.debug("Available device for SmolDocling: {}".format(device))
+
+            repo_cache_folder = self._repo_id.replace("/", "--")
+
+            # PARAMETERS:
+            if artifacts_path is None:
+                artifacts_path = self.download_models()
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.param_question = vlm_options.question  # set in SmolDoclingOptions
+            self.param_quantization_config = BitsAndBytesConfig(
+                load_in_8bit=vlm_options.load_in_8bit,  # True,
+                llm_int8_threshold=vlm_options.llm_int8_threshold,  # 6.0
+            )
+            self.param_quantized = vlm_options.quantized  # False
+
+            self.processor = AutoProcessor.from_pretrained(artifacts_path)
+            if not self.param_quantized:
+                self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained(
+                    artifacts_path,
+                    # device_map=device,
+                    torch_dtype=torch.bfloat16,
+                )
+                self.vlm_model = self.vlm_model.to(device)
+            else:
+                self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained(
+                    artifacts_path,
+                    # device_map=device,
+                    torch_dtype="auto",
+                    quantization_config=self.param_quantization_config,
+                ).to(device)
+
+    @staticmethod
+    def download_models(
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=SmolDoclingModel._repo_id,
+            force_download=force,
+            local_dir=local_dir,
+            # revision="v0.0.1",
         )
-        self.param_quantized = vlm_options.quantized  # False
 
-        self.processor = AutoProcessor.from_pretrained(artifacts_path)
-        if not self.param_quantized:
-            self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained(
-                artifacts_path,
-                device_map=device,
-                torch_dtype=torch.bfloat16,
-                # _attn_implementation="flash_attention_2",
-            )
-            self.vlm_model = self.vlm_model.to(device)
-        else:
-            self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained(
-                artifacts_path,
-                device_map=device,
-                torch_dtype="auto",
-                quantization_config=self.param_quantization_config,
-            )
+        return Path(download_path)
 
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py
index 01ed71a0..d08cf85a 100644
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -116,7 +116,10 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
 
     def __init__(self, pipeline_options: PipelineOptions):
         super().__init__(pipeline_options)
-        self.keep_backend = True
+        self.keep_backend = (
+            True  # For now, need to be able to query for page size post prediction
+        )
+        # self.keep_backend = False
 
     def _apply_on_pages(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 1278c9d1..7df8f15b 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -56,7 +56,6 @@ class StandardPdfPipeline(PaginatedPipeline):
 
     def __init__(self, pipeline_options: PdfPipelineOptions):
         super().__init__(pipeline_options)
-        print("------> Init Standard PDF Pipeline!")
         self.pipeline_options: PdfPipelineOptions
 
         artifacts_path: Optional[Path] = None
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 5cb6bf84..04ae40d9 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -97,6 +97,8 @@ class VlmPipeline(PaginatedPipeline):
 
         self.build_pipe = [
             SmolDoclingModel(
+                enabled=pipeline_options.do_vlm,
+                artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
                 vlm_options=self.pipeline_options.vlm_options,
             ),
@@ -297,6 +299,7 @@ class VlmPipeline(PaginatedPipeline):
                 token
                 for token in tokens
                 if not (token.startswith("<loc_") or token in ["<otsl>", "</otsl>"])
+                # if not (token.startswith(DocumentToken.BEG_LOC) or token in [DocumentToken.BEG_OTSL, DocumentToken.END_OTSL])
             ]
             # Split the string by those tokens to get the in-between text
             text_parts = re.split(pattern, s)
@@ -304,6 +307,7 @@ class VlmPipeline(PaginatedPipeline):
                 token
                 for token in text_parts
                 if not (token.startswith("<loc_") or token in ["<otsl>", "</otsl>"])
+                # if not (token.startswith(DocumentToken.BEG_LOC) or token in [DocumentToken.BEG_OTSL, DocumentToken.END_OTSL])
             ]
             # Remove any empty or purely whitespace strings from text_parts
             text_parts = [part for part in text_parts if part.strip()]
@@ -347,10 +351,15 @@ class VlmPipeline(PaginatedPipeline):
 
             # Regex for all recognized tags
             tag_pattern = (
-                r"<(?P<tag>title|document_index|otsl|section_header_level_1|checkbox_selected|"
-                r"checkbox_unselected|text|page_header|page_footer|formula|caption|picture|"
-                r"list_item|footnote|code)>.*?</(?P=tag)>"
+                rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
+                rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
+                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
+                rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
+                rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
+                rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
+                rf"{DocItemLabel.SECTION_HEADER}_level_1|otsl)>.*?</(?P=tag)>"
             )
+
             pattern = re.compile(tag_pattern, re.DOTALL)
 
             # Go through each match in order
@@ -438,8 +447,8 @@ class VlmPipeline(PaginatedPipeline):
         return doc
 
     @classmethod
-    def get_default_options(cls) -> PdfPipelineOptions:
-        return PdfPipelineOptions()
+    def get_default_options(cls) -> VlmPipelineOptions:
+        return VlmPipelineOptions()
 
     @classmethod
     def is_backend_supported(cls, backend: AbstractDocumentBackend):
diff --git a/docs/examples/minimal_smol_docling.py b/docs/examples/minimal_smol_docling.py
index 5d64dee4..66252f7b 100644
--- a/docs/examples/minimal_smol_docling.py
+++ b/docs/examples/minimal_smol_docling.py
@@ -19,7 +19,7 @@ pipeline_options = VlmPipelineOptions()  # artifacts_path="~/local_model_artifac
 pipeline_options.generate_page_images = True
 # If force_backend_text = True, text from backend will be used instead of generated text
 pipeline_options.force_backend_text = False
-
+# pipeline_options.do_vlm = True  # Set to False to disable the VLM model (i.e. SmolDocling); its extra Python imports are then skipped
 
 vlm_options = SmolDoclingOptions(
     # question="Convert this page to docling.",