diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index aaefc3a1..0d28791e 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -155,7 +155,7 @@ class VlmPredictionToken(BaseModel):
 
 class VlmPrediction(BaseModel):
     text: str = ""
-    generated_tokens: list[VlmPredictionToken] = -1
+    generated_tokens: list[VlmPredictionToken] = []
     generation_time: float = -1
 
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index edbf7b58..4aaa8ec5 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -261,6 +261,7 @@ class BaseVlmOptions(BaseModel):
 class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
     MARKDOWN = "markdown"
+    HTML = "html"
 
 
 class InferenceFramework(str, Enum):
@@ -285,6 +286,11 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     inference_framework: InferenceFramework
     response_format: ResponseFormat
 
+    scale: float = 2.0
+
+    use_kv_cache: bool = True
+    max_new_tokens: int = 4096
+
     @property
     def repo_cache_folder(self) -> str:
         return self.repo_id.replace("/", "--")
diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index ad814ec6..2abe37be 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -28,8 +28,7 @@ class HuggingFaceMlxModel(BasePageModel):
 
         self.enabled = enabled
         self.vlm_options = vlm_options
-
-        self.max_tokens=4096
+        self.max_tokens = vlm_options.max_new_tokens
 
         if self.enabled:
             try:
@@ -42,7 +41,6 @@ class HuggingFaceMlxModel(BasePageModel):
                 )
 
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
-            _log.debug(f"model init: {repo_cache_folder}")
 
             self.apply_chat_template = apply_chat_template
             self.stream_generate = stream_generate
@@ -52,7 +50,6 @@ class HuggingFaceMlxModel(BasePageModel):
                 _log.debug(
                     f"before HuggingFaceVlmModel.download_models: {self.vlm_options.repo_id}"
                 )
-                # artifacts_path = self.download_models(self.vlm_options.repo_id)
                 artifacts_path = HuggingFaceVlmModel.download_models(
                     self.vlm_options.repo_id,
                     progress=True,
@@ -60,39 +57,12 @@ class HuggingFaceMlxModel(BasePageModel):
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
 
-            _log.debug(f"downloaded model: {artifacts_path}")
-
-            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+            self.param_question = vlm_options.prompt
 
             ## Load the model
-            _log.debug("start loading model ...")
-            self.vlm_model, self.processor = load(artifacts_path)
-            _log.debug("loaded model ...")
+            self.vlm_model, self.processor = load(artifacts_path)
             self.config = load_config(artifacts_path)
 
-    """
-    @staticmethod
-    def download_models(
-        repo_id: str,
-        local_dir: Optional[Path] = None,
-        force: bool = False,
-        progress: bool = False,
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id=repo_id,
-            force_download=force,
-            local_dir=local_dir,
-            # revision="v0.0.1",
-        )
-
-        return Path(download_path)
-    """
-
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -104,8 +74,7 @@ class HuggingFaceMlxModel(BasePageModel):
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
 
-                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
-                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
 
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
@@ -136,7 +105,6 @@ class HuggingFaceMlxModel(BasePageModel):
                         max_tokens=4096,
                         verbose=False,
                     ):
-                        print(token.logprobs.shape)
                         if len(token.logprobs.shape)==1:
                             tokens.append(VlmPredictionToken(text=token.text,
                                                              token=token.token,
                                                              logprob=token.logprobs[0]))
                         else:
                             tokens.append(VlmPredictionToken(text=token.text,
                                                              token=token.token,
                                                              logprob=token.logprobs[0, token.token]))
-
-
-                        # print(token.text, end="", flush=True)
-                        output += token.text
+                        output += token.text
 
                         if "</doctag>" in token.text:
                             break
 
                     generation_time = time.time() - start_time
                     page_tags = output
-
-                    print(tokens)
-                    _log.debug(f"Generation time {generation_time:.2f} seconds.")
+                    _log.debug(f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens)/generation_time} tokens/sec).")
 
                     page.predictions.vlm_response = VlmPrediction(text=page_tags,
                                                                   generation_time=generation_time,
                                                                   generated_tokens=tokens)
 
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index ed99d259..3299205d 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -42,17 +42,19 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             )
             self.device = decide_device(accelerator_options.device)
-            self.device = "cpu"  # FIXME
 
-            self.use_cache = True
-            self.max_new_tokens = 64  # FIXME
+            if self.device == "mlx":
+                _log.warning("Mapping mlx to cpu for AutoModelForCausalLM")
+                self.device = "cpu"
+
+            self.use_cache = vlm_options.use_kv_cache
+            self.max_new_tokens = vlm_options.max_new_tokens
 
             _log.debug(f"Available device for VLM: {self.device}")
 
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
             # PARAMETERS:
             if artifacts_path is None:
-                # artifacts_path = self.download_models(self.vlm_options.repo_id)
                 artifacts_path = HuggingFaceVlmModel.download_models(
                     self.vlm_options.repo_id
                 )
@@ -100,7 +102,6 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             ).to(self.device)
 
             model_path = artifacts_path
-            print(f"model: {model_path}")
 
             # Load generation config
             self.generation_config = GenerationConfig.from_pretrained(model_path)
@@ -116,7 +117,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
 
-                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
                     # hi_res_image = page.get_image(scale=1.0)  # 72dpi
 
                     if hi_res_image is not None:
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 21f2c0dc..4681085f 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -11,6 +11,10 @@ from docling.datamodel.pipeline_options import (
     InferenceFramework,
     ResponseFormat,
     VlmPipelineOptions,
+    smoldocling_vlm_mlx_conversion_options,
+    smoldocling_vlm_conversion_options,
+    granite_vision_vlm_conversion_options,
+    granite_vision_vlm_ollama_conversion_options,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -33,7 +37,7 @@ pipeline_options.force_backend_text = False
 # pipeline_options.vlm_options = smoldocling_vlm_conversion_options
 
 ## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
-## pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
+pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
@@ -45,7 +49,7 @@ pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
 )
-vlm_conversion_options = pixtral_vlm_conversion_options
+pipeline_options.vlm_options = pixtral_vlm_conversion_options
 """
 
 """
@@ -55,7 +59,7 @@ pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
 )
-vlm_conversion_options = pixtral_vlm_conversion_options
+pipeline_options.vlm_options = pixtral_vlm_conversion_options
 """
 
 """
@@ -66,16 +70,19 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
 )
-vlm_conversion_options = phi_vlm_conversion_options
+pipeline_options.vlm_options = phi_vlm_conversion_options
 """
 
+"""
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/pixtral-12b-bf16",
     prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
+    scale=1.0,
 )
-vlm_conversion_options = pixtral_vlm_conversion_options
+pipeline_options.vlm_options = pixtral_vlm_conversion_options
+"""
 
 """
 qwen_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -84,11 +91,9 @@ qwen_vlm_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
 )
-vlm_conversion_options = qwen_vlm_conversion_options
+pipeline_options.vlm_options = qwen_vlm_conversion_options
 """
 
-pipeline_options.vlm_options = vlm_conversion_options
-
 ## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
     format_options={
@@ -116,19 +121,16 @@ for source in sources:
     res = converter.convert(source)
 
     print("")
-    print(res.document.export_to_markdown())
+    # print(res.document.export_to_markdown())
 
-    for page in res.pages:
+    for i, page in enumerate(res.pages):
         print("")
-        print(f"Predicted page in {pipeline_options.vlm_options.response_format}:")
+        print(f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format}:")
         print(page.predictions.vlm_response.text)
+        print(" ---------- ")
 
-    res.document.save_as_html(
-        filename=Path(f"{out_path}/{res.input.file.stem}.html"),
-        image_mode=ImageRefMode.REFERENCED,
-        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
-    )
-
+    print("===== Final output of the converted document =======")
+
     with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
         fp.write(json.dumps(res.document.export_to_dict()))
@@ -136,19 +138,27 @@ for source in sources:
         out_path / f"{res.input.file.stem}.json",
         image_mode=ImageRefMode.PLACEHOLDER,
     )
-
+    print(f" => produced {out_path / res.input.file.stem}.json")
+
     res.document.save_as_markdown(
         out_path / f"{res.input.file.stem}.md",
         image_mode=ImageRefMode.PLACEHOLDER,
     )
-
+    print(f" => produced {out_path / res.input.file.stem}.md")
+
+    res.document.save_as_html(
+        out_path / f"{res.input.file.stem}.html",
+        image_mode=ImageRefMode.EMBEDDED,
+        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
+        # split_page_view=True,
+    )
+    print(f" => produced {out_path / res.input.file.stem}.html")
+
     pg_num = res.document.num_pages()
     print("")
     inference_time = time.time() - start_time
     print(
         f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
     )
-
-print("================================================")
-print("done!")
-print("================================================")
+    print("====================================================")
+
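Usage note (not part of the patch): the sketch below shows one way a caller could exercise the new HuggingFaceVlmOptions fields introduced above (scale, use_kv_cache, max_new_tokens) with the VLM pipeline wiring already used in docs/examples/minimal_vlm_pipeline.py. It is a minimal, illustrative example: the repo_id, prompt, and input file name are placeholder assumptions rather than values taken from this diff, and the new ResponseFormat.HTML value would only be selected for a model that actually emits HTML.

# Minimal sketch (illustrative, not part of this patch).
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
    VlmPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Placeholder model and prompt; any VLM repo id supported by the chosen framework works here.
vlm_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,            # new option: page image scale (2.0 corresponds to 144 dpi)
    use_kv_cache=True,    # new option: KV-cache toggle consumed by the transformers-based wrappers
    max_new_tokens=4096,  # new option: generation budget forwarded to the model wrappers
)

pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = vlm_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("example.pdf")  # placeholder input document
print(result.document.export_to_markdown())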