From 0c7c7c11c27e599dd088ca598f83810ffb3ef551 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Fri, 16 May 2025 16:31:11 +0200
Subject: [PATCH] reformatted the code

Signed-off-by: Peter Staar
---
 .../pipeline_model_specializations.py         |  4 +-
 docling/document_converter.py                 |  8 +-
 docling/models/hf_vlm_model.py                |  3 +-
 .../models/hf_vlm_models/hf_vlm_mlx_model.py  | 10 +--
 .../hf_vlm_model_AutoModelForCausalLM.py      | 12 +--
 .../hf_vlm_model_AutoModelForVision2Seq.py    |  6 +-
 ...vlm_model_LlavaForConditionalGeneration.py | 10 +--
 docling/pipeline/vlm_pipeline.py              | 76 ++++++++++---------
 docs/examples/minimal_vlm_pipeline.py         | 52 +++++++------
 9 files changed, 96 insertions(+), 85 deletions(-)

diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_model_specializations.py
index 77e6c2f2..12ebcb46 100644
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -44,11 +44,11 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     inference_framework: InferenceFramework
     response_format: ResponseFormat
 
-    scale: float = 2.0 
+    scale: float = 2.0
 
     temperature: float = 0.0
     stop_strings: list[str] = []
-    
+
     use_kv_cache: bool = True
     max_new_tokens: int = 4096
 
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 25e6444e..e553c083 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -186,11 +186,11 @@ class DocumentConverter:
             Tuple[Type[BasePipeline], str], BasePipeline
         ] = {}
 
-    def _get_initialized_pipelines(self) -> dict[
-        tuple[Type[BasePipeline], str], BasePipeline
-    ]:
+    def _get_initialized_pipelines(
+        self,
+    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
         return self.initialized_pipelines
-    
+
     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
         options_str = str(pipeline_options.model_dump())
diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py
index 73e6f313..e82a34d0 100644
--- a/docling/models/hf_vlm_model.py
+++ b/docling/models/hf_vlm_model.py
@@ -6,7 +6,6 @@
 _log = logging.getLogger(__name__)
 
 class HuggingFaceVlmModel:
-
     @staticmethod
     def map_device_to_cpu_if_mlx(device: str) -> str:
         if device == "mps":
@@ -16,7 +15,7 @@ class HuggingFaceVlmModel:
                 return "cpu"
 
         return device
-    
+
     @staticmethod
     def download_models(
         repo_id: str,
diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index 57abaa7e..4e724191 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -30,7 +30,7 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
         self.max_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
-        
+
         if self.enabled:
             try:
                 from mlx_vlm import generate, load  # type: ignore
@@ -76,8 +76,6 @@ class HuggingFaceMlxModel(BasePageModel):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=self.vlm_options.scale)
-                    hi_res_image.save("./scratch/page.png")
-
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
 
@@ -128,8 +126,10 @@ class HuggingFaceMlxModel(BasePageModel):
                                 )
                             )
                         else:
-                            _log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")
-
+                            _log.warning(
+                                f"incompatible shape for logprobs: {token.logprobs.shape}"
+                            )
+
                         output += token.text
                         if "</doctag>" in token.text:
                             break
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index 213a5a28..e0c09c88 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -42,9 +42,9 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-        self.device = HuggingFaceVlmMode.map_device_to_cpu_if_mlx(self.device)
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
         _log.debug(f"Available device for VLM: {self.device}")
-        
+
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
@@ -120,14 +120,14 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
-                    
+
                     # Define prompt structure
                     prompt = self.formulate_prompt()
 
                     print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
 
                     inputs = self.processor(
                         text=prompt, images=hi_res_image, return_tensors="pt"
-                    )  #.to(self.device)
+                    )  # .to(self.device)
 
                     # Generate response
                     start_time = time.time()
@@ -153,7 +153,9 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
-                    page.predictions.vlm_response = VlmPrediction(text=response, generation_time=generation_time)
+                    page.predictions.vlm_response = VlmPrediction(
+                        text=response, generation_time=generation_time
+                    )
 
                 yield page
 
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
index b0c74aa8..69154d77 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -39,14 +39,14 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-        self.device = HuggingFaceVlmMode.map_device_to_cpu_if_mlx(self.device)
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
 
         _log.debug(f"Available device for HuggingFace VLM: {self.device}")
 
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
-        
+
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
         # PARAMETERS:
@@ -122,7 +122,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
                     """
-                    
+
                     # Define prompt structure
                     prompt = self.formulate_prompt()
 
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
index 1c286a8b..cd708b89 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
@@ -39,12 +39,12 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-        self.device = HuggingFaceVlmMode.map_device_to_cpu_if_mlx(self.device)
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
 
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
-        
+
         _log.debug(f"Available device for VLM: {self.device}")
 
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@@ -94,7 +94,7 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
                     """
-                    
+
                     images = [hi_res_image]
 
                     # Define prompt structure
@@ -113,7 +113,7 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
                         temperature=self.temperature,
                     )
 
-                    #num_tokens = len(generate_ids[0])
+                    # num_tokens = len(generate_ids[0])
                     generation_time = time.time() - start_time
 
                     response = self.processor.batch_decode(
@@ -124,7 +124,7 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
 
                     page.predictions.vlm_response = VlmPrediction(
                         text=response,
-                        #generated_tokens=num_tokens,
+                        # generated_tokens=num_tokens,
                         generation_time=generation_time,
                     )
 
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index b902bb2e..e9abae6d 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -1,11 +1,23 @@
-import re
 import logging
+import re
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
 
 # from docling_core.types import DoclingDocument
-from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItem,
+    DoclingDocument,
+    ImageRef,
+    PictureItem,
+    ProvenanceItem,
+    TextItem,
+)
+from docling_core.types.doc.base import (
+    BoundingBox,
+    Size,
+)
 from docling_core.types.doc.document import DocTagsDocument
 from PIL import Image as PILImage
 
@@ -20,14 +32,6 @@ from docling.datamodel.pipeline_model_specializations import (
     InferenceFramework,
     ResponseFormat,
 )
-from docling_core.types.doc.base import (
-    Size,
-    BoundingBox,
-)
-from docling_core.types.doc import (
-    ProvenanceItem,
-    DoclingDocument
-)
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
@@ -168,6 +172,7 @@ class VlmPipeline(PaginatedPipeline):
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS
             ):
+                """
                 doctags_list = []
                 image_list = []
                 for page in conv_res.pages:
@@ -207,6 +212,9 @@ class VlmPipeline(PaginatedPipeline):
                         txt = self.extract_text_from_backend(page, crop_bbox)
                         element.text = txt
                         element.orig = txt
+                """
+                conv_res.document = self._turn_dt_into_doc(conv_res)
+
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN
@@ -271,21 +279,18 @@ class VlmPipeline(PaginatedPipeline):
         if self.force_backend_text:
             scale = self.pipeline_options.images_scale
             for element, _level in conv_res.document.iterate_items():
-                if (not isinstance(element, TextItem)
-                    or len(element.prov) == 0
-                ):
+                if not isinstance(element, TextItem) or len(element.prov) == 0:
                     continue
                 crop_bbox = (
                     element.prov[0]
                     .bbox.scaled(scale=scale)
-                    .to_top_left_origin(
-                        page_height=page.size.height * scale
-                    )
+                    .to_top_left_origin(page_height=page.size.height * scale)
                 )
                 txt = self.extract_text_from_backend(page, crop_bbox)
                 element.text = txt
                 element.orig = txt
-            
+
+        return conv_res.document
 
     """
     def _turn_md_into_doc(self, conv_res):
         response_bytes = BytesIO(conv_res.pages[0].predictions.vlm_response.text.encode("utf8"))
 
         out_doc = InputDocument(
             path_or_stream=response_bytes,
             filename=conv_res.input.file.name,
             format=InputFormat.MD,
             backend=MarkdownDocumentBackend,
         )
         backend = MarkdownDocumentBackend(
             in_doc=out_doc,
             path_or_stream=response_bytes,
         )
         return backend.convert()
 
@@ -308,45 +313,40 @@ class VlmPipeline(PaginatedPipeline):
     """
 
     def _turn_md_into_doc(self, conv_res):
-
         def _extract_markdown_code(text):
             """
             Extracts text from markdown code blocks (enclosed in triple backticks).
             If no code blocks are found, returns the original text.
-            
+
             Args:
                 text (str): Input text that may contain markdown code blocks
-            
+
            Returns:
                 str: Extracted code if code blocks exist, otherwise original text
             """
            # Regex pattern to match content between triple backticks
            # This handles multiline content and optional language specifier
-            pattern = r'^```(?:\w*\n)?(.*?)```(\n)*$'
-            
-            # Search for matches with DOTALL flag to match across multiple lines
-            matches = re.findall(pattern, text, re.DOTALL)
+            pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
 
             # Search with DOTALL flag to match across multiple lines
             mtch = re.search(pattern, text, re.DOTALL)
-            
+
             if mtch:
                 # Return only the content of the first capturing group
                 return mtch.group(1)
             else:
                 # No code blocks found, return original text
                 return text
-        
-        for pg_idx, page in enumerate(conv_res.pages):
-            page_no = pg_idx+1  # FIXME: might be incorrect
-            
+
+        for pg_idx, page in enumerate(conv_res.pages):
+            page_no = pg_idx + 1  # FIXME: might be incorrect
+
             predicted_text = ""
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text + "\n\n"
             predicted_text = _extract_markdown_code(text=predicted_text)
-            
+
             response_bytes = BytesIO(predicted_text.encode("utf8"))
             out_doc = InputDocument(
                 path_or_stream=response_bytes,
@@ -370,20 +370,24 @@ class VlmPipeline(PaginatedPipeline):
             conv_res.document.add_page(
                 page_no=page_no,
                 size=Size(width=pg_width, height=pg_height),
-                image=ImageRef.from_pil(image=page.image, dpi=72) if page.image else None,
+                image=ImageRef.from_pil(image=page.image, dpi=72)
+                if page.image
+                else None,
             )
-            
+
             for item, level in page_doc.iterate_items():
                 item.prov = [
-                    ProvenanceItem(page_no=pg_idx+1,
-                                   bbox=BoundingBox(t=0.0, b=0.0, l=0.0, r=0.0),
-                                   charspan=[0,0])
+                    ProvenanceItem(
+                        page_no=pg_idx + 1,
+                        bbox=BoundingBox(t=0.0, b=0.0, l=0.0, r=0.0),
+                        charspan=[0, 0],
+                    )
                 ]
                 conv_res.document.append_child_item(child=item)
                 print(item)
 
         return conv_res.document
-    
+
     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:
         return VlmPipelineOptions()
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index be2afe06..fa79fcc9 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
+from tabulate import tabulate
 
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_model_specializations import (
@@ -25,8 +26,6 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
-from tabulate import tabulate
-
 ## Use experimental VlmPipeline
 pipeline_options = VlmPipelineOptions()
 # If force_backend_text = True, text from backend will be used instead of generated text
@@ -101,19 +100,20 @@ qwen_vlm_conversion_options = HuggingFaceVlmOptions(
 pipeline_options.vlm_options = qwen_vlm_conversion_options
 """
 
+
 def convert(sources: list[Path], converter):
     for source in sources:
         # start_time = time.time()
         print("================================================")
         print(f"Processing... {source}")
         print("================================================")
         print("")
 
         res = converter.convert(source)
-        
+
         print("")
         # print(res.document.export_to_markdown())
-        
+
         model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
         framework = pipeline_options.vlm_options.inference_framework
         fname = f"{res.input.file.stem}-{model_id}-{framework}"
@@ -127,7 +127,7 @@ def convert(sources: list[Path], converter):
             )
             print(page.predictions.vlm_response.text)
             print(" ---------- ")
-        
+
         print("===== Final output of the converted document =======")
 
         with (out_path / f"{fname}.json").open("w") as fp:
@@ -152,7 +152,7 @@ def convert(sources: list[Path], converter):
             split_page_view=True,
         )
         print(f" => produced {out_path / fname}.html")
-        
+
         pg_num = res.document.num_pages()
         print("")
         print(
@@ -161,18 +161,24 @@ def convert(sources: list[Path], converter):
         print("====================================================")
 
         # return [source, f"{out_path / fname}.html", model_id, framework, inference_time, ]
-        return [source, model_id, framework, pg_num, inference_time, ]
-    
-if __name__ == "__main__":
+        return [
+            source,
+            model_id,
+            framework,
+            pg_num,
+            inference_time,
+        ]
+
 
+if __name__ == "__main__":
 
     sources = [
         # "tests/data/2305.03393v1-pg9-img.png",
         "tests/data/pdf/2305.03393v1-pg9.pdf",
     ]
-    
+
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)
-    
+
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
 
@@ -186,16 +192,16 @@ if __name__ == "__main__":
 
     rows = []
     for vlm_options in [
-        # smoldocling_vlm_conversion_options, \
-        smoldocling_vlm_mlx_conversion_options, \
-        # granite_vision_vlm_conversion_options, \
-        # phi_vlm_conversion_options, \
-        # qwen25_vl_3b_vlm_mlx_conversion_options, \
-        # pixtral_12b_vlm_mlx_conversion_options,
-        # pixtral_12b_vlm_conversion_options,
+        # smoldocling_vlm_conversion_options, \
+        smoldocling_vlm_mlx_conversion_options,
+        # granite_vision_vlm_conversion_options, \
+        # phi_vlm_conversion_options, \
+        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        # pixtral_12b_vlm_mlx_conversion_options,
+        # pixtral_12b_vlm_conversion_options,
     ]:
         pipeline_options.vlm_options = vlm_options
-        
+
         ## Set up pipeline for PDF or image inputs
         converter = DocumentConverter(
             format_options={
@@ -209,12 +215,12 @@ if __name__ == "__main__":
                 ),
             },
         )
-        
+
         row = convert(sources=sources, converter=converter)
         print("pipelines: \n", converter._get_initialized_pipelines())
-        
+
         rows.append(row)
-        
+
     print(tabulate(rows))
 
     print("see if memory gets released ...")