From 3407955a47c48861a6e4610b355660777f8d8c47 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Tue, 13 May 2025 18:23:55 +0200
Subject: [PATCH] all working, now serious refactoring necessary

Signed-off-by: Peter Staar
---
 docling/datamodel/pipeline_options.py          |   1 +
 docling/models/hf_vlm_model.py                 |   3 +-
 .../hf_vlm_mlx_model.py}                       |   0
 .../hf_vlm_model_AutoModelForCausalLM.py       |   6 +-
 ...vlm_model_LlavaForConditionalGeneration.py  | 141 ++++++++++++++++++
 docling/pipeline/vlm_pipeline.py               |  24 ++-
 docs/examples/minimal_vlm_pipeline.py          |  48 +++++-
 7 files changed, 202 insertions(+), 21 deletions(-)
 rename docling/models/{hf_mlx_model.py => hf_vlm_models/hf_vlm_mlx_model.py} (100%)
 create mode 100644 docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 3ee3702b..f6d127ca 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -269,6 +269,7 @@ class InferenceFramework(str, Enum):
     OPENAI = "openai"
     TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
     TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
+    TRANSFORMERS_LlavaForConditionalGeneration = "transformers-LlavaForConditionalGeneration"
 
 
 class HuggingFaceVlmOptions(BaseVlmOptions):
diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py
index 64023545..79518f0f 100644
--- a/docling/models/hf_vlm_model.py
+++ b/docling/models/hf_vlm_model.py
@@ -18,7 +18,6 @@ _log = logging.getLogger(__name__)
 
 
 class HuggingFaceVlmModel(BasePageModel):
-
     """
     def __init__(
        self,
@@ -92,7 +91,7 @@ class HuggingFaceVlmModel(BasePageModel):
            # trust_remote_code=True,
        )  # .to(self.device)
    """
-    
+
     @staticmethod
     def download_models(
         repo_id: str,
diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
similarity index 100%
rename from docling/models/hf_mlx_model.py
rename to docling/models/hf_vlm_models/hf_vlm_mlx_model.py
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index 692b77e6..5cfe2006 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -42,7 +42,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-        self.device = "cpu" # FIXME
+        self.device = "cpu"  # FIXME
         _log.debug(f"Available device for VLM: {self.device}")
 
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@@ -165,7 +165,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
 
                     num_tokens = len(generate_ids[0])
                     generation_time = time.time() - start_time
-                    
+
                     response = self.processor.batch_decode(
                         generate_ids,
                         skip_special_tokens=True,
@@ -175,7 +175,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
-                    
+
                     # inference_time = time.time() - start_time
                     # tokens_per_second = num_tokens / generation_time
                     # print("")
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
new file mode 100644
index 00000000..897ec9f6
--- /dev/null
+++ b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
@@ -0,0 +1,141 @@
+import logging
+import time
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Optional
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
+    HuggingFaceVlmOptions,
+)
+from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+_log = logging.getLogger(__name__)
+
+
+class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: HuggingFaceVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.trust_remote_code = True
+
+        self.vlm_options = vlm_options
+        print(self.vlm_options)
+
+        if self.enabled:
+            import torch
+            from transformers import (  # type: ignore
+                LlavaForConditionalGeneration,
+                AutoProcessor,
+            )
+
+            self.device = decide_device(accelerator_options.device)
+            self.device = "cpu"  # FIXME
+
+            torch.set_num_threads(12)  # Adjust the number as needed
+
+            _log.debug(f"Available device for VLM: {self.device}")
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
+            # PARAMETERS:
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            model_path = artifacts_path
+            print(f"model: {model_path}")
+
+            self.max_new_tokens = 64  # FIXME
+
+            self.processor = AutoProcessor.from_pretrained(
+                artifacts_path,
+                trust_remote_code=self.trust_remote_code,
+            )
+            self.vlm_model = LlavaForConditionalGeneration.from_pretrained(artifacts_path).to(self.device)
+
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+            # revision="v0.0.1",
+        )
+
+        return Path(download_path)
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+
+                    if hi_res_image is not None:
+                        im_width, im_height = hi_res_image.size
+
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    images = [
+                        hi_res_image
+                    ]
+                    prompt = "[INST]Describe the images.\n[IMG][/INST]"
+
+                    inputs = self.processor(text=prompt, images=images,
return_tensors="pt", use_fast=False).to(self.device) #.to("cuda") + generate_ids = self.vlm_model.generate( + **inputs, + max_new_tokens=self.max_new_tokens, + use_cache=True # Enables KV caching which can improve performance + ) + response = self.processor.batch_decode(generate_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=False)[0] + print(f"response: {response}") + """ + _log.debug( + f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds." + ) + """ + # inference_time = time.time() - start_time + # tokens_per_second = num_tokens / generation_time + # print("") + # print(f"Page Inference Time: {inference_time:.2f} seconds") + # print(f"Total tokens on page: {num_tokens:.2f}") + # print(f"Tokens/sec: {tokens_per_second:.2f}") + # print("") + page.predictions.vlm_response = VlmPrediction(text=response) + + yield page diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 5bfb82b2..5123fc38 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -22,8 +22,14 @@ from docling.datamodel.pipeline_options import ( ) from docling.datamodel.settings import settings from docling.models.api_vlm_model import ApiVlmModel -from docling.models.hf_mlx_model import HuggingFaceMlxModel -from docling.models.hf_vlm_model import HuggingFaceVlmModel + +# from docling.models.hf_vlm_model import HuggingFaceVlmModel +from docling.models.hf_vlm_models.hf_vlm_mlx_model import ( + HuggingFaceMlxModel +) +from docling.models.hf_vlm_models.hf_vlm_model_LlavaForConditionalGeneration import ( + HuggingFaceVlmModel_LlavaForConditionalGeneration +) from docling.models.hf_vlm_models.hf_vlm_model_AutoModelForCausalLM import ( HuggingFaceVlmModel_AutoModelForCausalLM, ) @@ -107,18 +113,20 @@ class VlmPipeline(PaginatedPipeline): vlm_options=vlm_options, ), ] - else: - _log.warning( - "falling back to HuggingFaceVlmModel_AutoModelForVision2Seq pipeline" - ) + elif ( + vlm_options.inference_framework + == InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration + ): self.build_pipe = [ - HuggingFaceVlmModel_AutoModelForVision2Seq( + HuggingFaceVlmModel_LlavaForConditionalGeneration( enabled=True, # must be always enabled for this pipeline to make sense. 
                         artifacts_path=artifacts_path,
                         accelerator_options=pipeline_options.accelerator_options,
                         vlm_options=vlm_options,
                     ),
-                ]
+                ]
+            else:
+                raise ValueError(f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}")
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 45ef9510..eebf6699 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -39,24 +39,56 @@ pipeline_options.force_backend_text = False
 
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
 
-# pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-#     repo_id="mistralai/Pixtral-12B-Base-2409",
-#     # prompt="OCR the full page to markdown.",
-#     prompt="OCR this image and export it in MarkDown.",
-#     response_format=ResponseFormat.MARKDOWN,
-#     inference_framework=InferenceFramework.TRANSFORMERS,
-# )
+"""
+pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="mistralai/Pixtral-12B-Base-2409",
+    prompt="OCR this image and export it in MarkDown.",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
+)
+vlm_conversion_options = pixtral_vlm_conversion_options
+"""
 
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="mistral-community/pixtral-12b",
+    prompt="OCR this image and export it in MarkDown.",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
+)
+vlm_conversion_options = pixtral_vlm_conversion_options
+
+"""
+phi_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="microsoft/Phi-4-multimodal-instruct",
     # prompt="OCR the full page to markdown.",
     prompt="OCR this image and export it in MarkDown.",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
 )
+vlm_conversion_options = phi_vlm_conversion_options
+"""
+"""
+pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="mlx-community/pixtral-12b-bf16",
+    prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+)
+vlm_conversion_options = pixtral_vlm_conversion_options
+"""
 
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
+"""
+qwen_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
+    prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+)
+vlm_conversion_options = qwen_vlm_conversion_options
+"""
+
+pipeline_options.vlm_options = vlm_conversion_options
 
 ## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
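
For reference, below is a minimal end-to-end sketch of how the new TRANSFORMERS_LlavaForConditionalGeneration framework would be selected, condensed from the docs/examples/minimal_vlm_pipeline.py changes above. The import paths, the PdfFormatOption/VlmPipeline wiring, and the input file name are assumptions based on the docling layout at this commit; they are not part of the diff itself.

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        HuggingFaceVlmOptions,
        InferenceFramework,
        ResponseFormat,
        VlmPipelineOptions,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # Pixtral 12B driven through the new LlavaForConditionalGeneration backend
    pipeline_options = VlmPipelineOptions()
    pipeline_options.vlm_options = HuggingFaceVlmOptions(
        repo_id="mistral-community/pixtral-12b",
        prompt="OCR this image and export it in MarkDown.",
        response_format=ResponseFormat.MARKDOWN,
        inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
    )

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert(source="path/to/document.pdf")  # hypothetical input file
    print(result.document.export_to_markdown())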