From 0c7c7c11c27e599dd088ca598f83810ffb3ef551 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Fri, 16 May 2025 16:31:11 +0200
Subject: [PATCH] reformatted the code

Signed-off-by: Peter Staar
---
 .../pipeline_model_specializations.py         |  4 +-
 docling/document_converter.py                 |  8 +-
 docling/models/hf_vlm_model.py                |  3 +-
 .../models/hf_vlm_models/hf_vlm_mlx_model.py  | 10 +--
 .../hf_vlm_model_AutoModelForCausalLM.py      | 12 +--
 .../hf_vlm_model_AutoModelForVision2Seq.py    |  6 +-
 ...vlm_model_LlavaForConditionalGeneration.py | 10 +--
 docling/pipeline/vlm_pipeline.py              | 76 ++++++++++---------
 docs/examples/minimal_vlm_pipeline.py         | 52 +++++++------
 9 files changed, 96 insertions(+), 85 deletions(-)

diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_model_specializations.py
index 77e6c2f2..12ebcb46 100644
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -44,11 +44,11 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     inference_framework: InferenceFramework
     response_format: ResponseFormat
 
-    scale: float = 2.0 
+    scale: float = 2.0
 
     temperature: float = 0.0
     stop_strings: list[str] = []
-    
+
     use_kv_cache: bool = True
     max_new_tokens: int = 4096
 
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 25e6444e..e553c083 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -186,11 +186,11 @@ class DocumentConverter:
             Tuple[Type[BasePipeline], str], BasePipeline
         ] = {}
 
-    def _get_initialized_pipelines(self) -> dict[
-        tuple[Type[BasePipeline], str], BasePipeline
-    ]:
+    def _get_initialized_pipelines(
+        self,
+    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
         return self.initialized_pipelines
-    
+
     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
         options_str = str(pipeline_options.model_dump())
diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py
index 73e6f313..e82a34d0 100644
--- a/docling/models/hf_vlm_model.py
+++ b/docling/models/hf_vlm_model.py
@@ -6,7 +6,6 @@
 _log = logging.getLogger(__name__)
 
 class HuggingFaceVlmModel:
-
     @staticmethod
     def map_device_to_cpu_if_mlx(device: str) -> str:
         if device == "mps":
@@ -16,7 +15,7 @@ class HuggingFaceVlmModel:
                 return "cpu"
 
         return device
-    
+
     @staticmethod
     def download_models(
         repo_id: str,
diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index 57abaa7e..4e724191 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -30,7 +30,7 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
         self.max_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
-        
+
         if self.enabled:
             try:
                 from mlx_vlm import generate, load  # type: ignore
@@ -76,8 +76,6 @@ class HuggingFaceMlxModel(BasePageModel):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=self.vlm_options.scale)
-                    hi_res_image.save("./scratch/page.png")
-
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
 
@@ -128,8 +126,10 @@ class HuggingFaceMlxModel(BasePageModel):
                                 )
                             )
                         else:
-                            _log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")
-
+                            _log.warning(
+                                f"incompatible shape for logprobs: {token.logprobs.shape}"
+                            )
+
                         output += token.text
                         if "</doctag>" in token.text:
                             break
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index 213a5a28..e0c09c88 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -42,9 +42,9 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-        self.device = HuggingFaceVlmMode.map_device_to_cpu_if_mlx(self.device)
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
         _log.debug(f"Available device for VLM: {self.device}")
-        
+
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
@@ -120,14 +120,14 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
-                    
+
                     # Define prompt structure
                     prompt = self.formulate_prompt()
 
                     print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
 
                     inputs = self.processor(
                         text=prompt, images=hi_res_image, return_tensors="pt"
-                    )  #.to(self.device)
+                    )  # .to(self.device)
 
                     # Generate response
                     start_time = time.time()
@@ -153,7 +153,9 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
-                    page.predictions.vlm_response = VlmPrediction(text=response, generation_time=generation_time)
+                    page.predictions.vlm_response = VlmPrediction(
+                        text=response, generation_time=generation_time
+                    )
 
                 yield page
 
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
index b0c74aa8..69154d77 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -39,14 +39,14 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-        self.device = HuggingFaceVlmMode.map_device_to_cpu_if_mlx(self.device)
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
 
         _log.debug(f"Available device for HuggingFace VLM: {self.device}")
 
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
-        
+
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
         # PARAMETERS:
@@ -122,7 +122,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
                     """
-                    
+
                     # Define prompt structure
                     prompt = self.formulate_prompt()
 
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
index 1c286a8b..cd708b89 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
@@ -39,12 +39,12 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-        self.device = HuggingFaceVlmMode.map_device_to_cpu_if_mlx(self.device)
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
 
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
-        
+
         _log.debug(f"Available device for VLM: {self.device}")
 
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@@ -94,7 +94,7 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
                     """
-                    
+
                     images = [hi_res_image]
 
                     # Define prompt structure
@@ -113,7 +113,7 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
                         temperature=self.temperature,
                     )
 
-                    #num_tokens = len(generate_ids[0])
+                    # num_tokens = len(generate_ids[0])
                     generation_time = time.time() - start_time
 
                     response = self.processor.batch_decode(
@@ -124,7 +124,7 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
 
                     page.predictions.vlm_response = VlmPrediction(
                         text=response,
-                        #generated_tokens=num_tokens,
+                        # generated_tokens=num_tokens,
                         generation_time=generation_time,
                     )
 
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index b902bb2e..e9abae6d 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -1,11 +1,23 @@
-import re
 import logging
+import re
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
 
 # from docling_core.types import DoclingDocument
-from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItem,
+    DoclingDocument,
+    ImageRef,
+    PictureItem,
+    ProvenanceItem,
+    TextItem,
+)
+from docling_core.types.doc.base import (
+    BoundingBox,
+    Size,
+)
 from docling_core.types.doc.document import DocTagsDocument
 from PIL import Image as PILImage
 
@@ -20,14 +32,6 @@ from docling.datamodel.pipeline_model_specializations import (
     InferenceFramework,
     ResponseFormat,
 )
-from docling_core.types.doc.base import (
-    Size,
-    BoundingBox,
-)
-from docling_core.types.doc import (
-    ProvenanceItem,
-    DoclingDocument
-)
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
@@ -168,6 +172,7 @@ class VlmPipeline(PaginatedPipeline):
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS
             ):
+                """
                 doctags_list = []
                 image_list = []
                 for page in conv_res.pages:
@@ -207,6 +212,9 @@ class VlmPipeline(PaginatedPipeline):
                         txt = self.extract_text_from_backend(page, crop_bbox)
                         element.text = txt
                         element.orig = txt
+                """
+                conv_res.document = self._turn_dt_into_doc(conv_res)
+
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN
@@ -271,21 +279,18 @@ class VlmPipeline(PaginatedPipeline):
         if self.force_backend_text:
             scale = self.pipeline_options.images_scale
             for element, _level in conv_res.document.iterate_items():
-                if (not isinstance(element, TextItem)
-                    or len(element.prov) == 0
-                ):
+                if not isinstance(element, TextItem) or len(element.prov) == 0:
                     continue
                 crop_bbox = (
                     element.prov[0]
                     .bbox.scaled(scale=scale)
-                    .to_top_left_origin(
-                        page_height=page.size.height * scale
-                    )
+                    .to_top_left_origin(page_height=page.size.height * scale)
                 )
                 txt = self.extract_text_from_backend(page, crop_bbox)
                 element.text = txt
                 element.orig = txt
-            
+
+        return conv_res.document
 
     """
     def _turn_md_into_doc(self, conv_res):
         response_bytes = BytesIO(conv_res.pages[0].predictions.vlm_response.text.encode("utf8"))
 
         out_doc = InputDocument(
             path_or_stream=response_bytes,
             filename=conv_res.input.file.name,
             format=InputFormat.MD,
             backend=MarkdownDocumentBackend,
         )
         backend = MarkdownDocumentBackend(
             in_doc=out_doc,
             path_or_stream=response_bytes,
         )
         return backend.convert()
 
@@ -308,45 +313,40 @@ class VlmPipeline(PaginatedPipeline):
     """
 
     def _turn_md_into_doc(self, conv_res):
-
         def _extract_markdown_code(text):
             """
             Extracts text from markdown code blocks (enclosed in triple backticks).
             If no code blocks are found, returns the original text.
-            
+
             Args:
                 text (str): Input text that may contain markdown code blocks
-            
+
            Returns:
                 str: Extracted code if code blocks exist, otherwise original text
             """
            # Regex pattern to match content between triple backticks
            # This handles multiline content and optional language specifier
-            pattern = r'^```(?:\w*\n)?(.*?)```(\n)*$'
-            
-            # Search for matches with DOTALL flag to match across multiple lines
-            matches = re.findall(pattern, text, re.DOTALL)
+            pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
 
             # Search with DOTALL flag to match across multiple lines
             mtch = re.search(pattern, text, re.DOTALL)
-            
+
             if mtch:
                 # Return only the content of the first capturing group
                 return mtch.group(1)
             else:
                 # No code blocks found, return original text
                 return text
-        
-        for pg_idx, page in enumerate(conv_res.pages):
-            page_no = pg_idx+1  # FIXME: might be incorrect
-            
+
+        for pg_idx, page in enumerate(conv_res.pages):
+            page_no = pg_idx + 1  # FIXME: might be incorrect
+
             predicted_text = ""
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text + "\n\n"
             predicted_text = _extract_markdown_code(text=predicted_text)
-            
+
             response_bytes = BytesIO(predicted_text.encode("utf8"))
             out_doc = InputDocument(
                 path_or_stream=response_bytes,
@@ -370,20 +370,24 @@ class VlmPipeline(PaginatedPipeline):
             conv_res.document.add_page(
                 page_no=page_no,
                 size=Size(width=pg_width, height=pg_height),
-                image=ImageRef.from_pil(image=page.image, dpi=72) if page.image else None,
+                image=ImageRef.from_pil(image=page.image, dpi=72)
+                if page.image
+                else None,
             )
-            
+
             for item, level in page_doc.iterate_items():
                 item.prov = [
-                    ProvenanceItem(page_no=pg_idx+1,
-                                   bbox=BoundingBox(t=0.0, b=0.0, l=0.0, r=0.0),
-                                   charspan=[0,0])
+                    ProvenanceItem(
+                        page_no=pg_idx + 1,
+                        bbox=BoundingBox(t=0.0, b=0.0, l=0.0, r=0.0),
+                        charspan=[0, 0],
+                    )
                 ]
                 conv_res.document.append_child_item(child=item)
                 print(item)
 
         return conv_res.document
-    
+
     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:
         return VlmPipelineOptions()
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index be2afe06..fa79fcc9 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
+from tabulate import tabulate
 
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_model_specializations import (
@@ -25,8 +26,6 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
-from tabulate import tabulate
-
 ## Use experimental VlmPipeline
 pipeline_options = VlmPipelineOptions()
 # If force_backend_text = True, text from backend will be used instead of generated text
@@ -101,19 +100,20 @@ qwen_vlm_conversion_options = HuggingFaceVlmOptions(
 pipeline_options.vlm_options = qwen_vlm_conversion_options
 """
 
+
 def convert(sources: list[Path], converter):
     for source in sources:
         # start_time = time.time()
         print("================================================")
         print(f"Processing... {source}")
         print("================================================")
         print("")
 
         res = converter.convert(source)
-        
+
         print("")
         # print(res.document.export_to_markdown())
-        
+
         model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
         framework = pipeline_options.vlm_options.inference_framework
         fname = f"{res.input.file.stem}-{model_id}-{framework}"
@@ -127,7 +127,7 @@ def convert(sources: list[Path], converter):
             )
             print(page.predictions.vlm_response.text)
             print(" ---------- ")
-        
+
         print("===== Final output of the converted document =======")
 
         with (out_path / f"{fname}.json").open("w") as fp:
@@ -152,7 +152,7 @@ def convert(sources: list[Path], converter):
             split_page_view=True,
         )
         print(f" => produced {out_path / fname}.html")
-        
+
         pg_num = res.document.num_pages()
         print("")
         print(
@@ -161,18 +161,24 @@ def convert(sources: list[Path], converter):
         print("====================================================")
 
         # return [source, f"{out_path / fname}.html", model_id, framework, inference_time, ]
-        return [source, model_id, framework, pg_num, inference_time, ]
-    
-if __name__ == "__main__":
+        return [
+            source,
+            model_id,
+            framework,
+            pg_num,
+            inference_time,
+        ]
+
 
+if __name__ == "__main__":
 
     sources = [
         # "tests/data/2305.03393v1-pg9-img.png",
         "tests/data/pdf/2305.03393v1-pg9.pdf",
     ]
-    
+
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)
-    
+
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
 
@@ -186,16 +192,16 @@ if __name__ == "__main__":
 
     rows = []
     for vlm_options in [
-        # smoldocling_vlm_conversion_options, \
-        smoldocling_vlm_mlx_conversion_options, \
-        # granite_vision_vlm_conversion_options, \
-        # phi_vlm_conversion_options, \
-        # qwen25_vl_3b_vlm_mlx_conversion_options, \
-        # pixtral_12b_vlm_mlx_conversion_options,
-        # pixtral_12b_vlm_conversion_options,
+        # smoldocling_vlm_conversion_options, \
+        smoldocling_vlm_mlx_conversion_options,
+        # granite_vision_vlm_conversion_options, \
+        # phi_vlm_conversion_options, \
+        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        # pixtral_12b_vlm_mlx_conversion_options,
+        # pixtral_12b_vlm_conversion_options,
     ]:
         pipeline_options.vlm_options = vlm_options
-        
+
         ## Set up pipeline for PDF or image inputs
         converter = DocumentConverter(
             format_options={
@@ -209,12 +215,12 @@ if __name__ == "__main__":
                 ),
             },
         )
-        
+
         row = convert(sources=sources, converter=converter)
         print("pipelines: \n", converter._get_initialized_pipelines())
-        
+
         rows.append(row)
-        
+
     print(tabulate(rows))
 
     print("see if memory gets released ...")