From d41b856961838d897dd0e596fc369d3814cc4832 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Fri, 16 May 2025 12:39:26 +0200
Subject: [PATCH] finalising last points for VLM support

Signed-off-by: Peter Staar
---
 .../pipeline_model_specializations.py        |   1 +
 docling/document_converter.py                |   5 +
 .../models/hf_vlm_models/hf_vlm_mlx_model.py |   6 +-
 .../hf_vlm_model_AutoModelForVision2Seq.py   |   5 +-
 docling/pipeline/vlm_pipeline.py             | 129 ++++++++++++++
 docs/examples/minimal_vlm_pipeline.py        | 164 +++++++++++-------
 6 files changed, 246 insertions(+), 64 deletions(-)

diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_model_specializations.py
index 68db935d..85aef998 100644
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -64,6 +64,7 @@ class ApiVlmOptions(BaseVlmOptions):
     params: Dict[str, Any] = {}
     scale: float = 2.0
     timeout: float = 60
+    concurrency: int = 1
     response_format: ResponseFormat
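
The new `concurrency` field caps how many pages an API-backed VLM client may have in flight at once. A minimal sketch of configuring it, assuming the `url` and `prompt` fields that `ApiVlmOptions` exposes elsewhere in this module (the endpoint below is a placeholder):

    from docling.datamodel.pipeline_model_specializations import (
        ApiVlmOptions,
        ResponseFormat,
    )

    # `url` and `prompt` are assumed fields; the endpoint is a placeholder.
    api_options = ApiVlmOptions(
        url="http://localhost:8000/v1/chat/completions",
        prompt="Convert this page to markdown.",
        timeout=90,
        concurrency=4,  # new field: up to four concurrent requests
        response_format=ResponseFormat.MARKDOWN,
    )
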
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 08095d43..25e6444e 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -186,6 +186,11 @@ class DocumentConverter:
             Tuple[Type[BasePipeline], str], BasePipeline
         ] = {}

+    def _get_initialized_pipelines(self) -> dict[
+        tuple[Type[BasePipeline], str], BasePipeline
+    ]:
+        return self.initialized_pipelines
+
     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
         options_str = str(pipeline_options.model_dump())

diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index bc9a9317..73144404 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -71,7 +71,7 @@ class HuggingFaceMlxModel(BasePageModel):
                 if not page._backend.is_valid():
                     yield page
                 else:
-                    with TimeRecorder(conv_res, "vlm"):
+                    with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                         assert page.size is not None

                         hi_res_image = page.get_image(scale=self.vlm_options.scale)
@@ -124,7 +124,9 @@ class HuggingFaceMlxModel(BasePageModel):
                                         logprob=token.logprobs[0, token.token],
                                     )
                                 )
-
+                            else:
+                                _log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")
+
                             output += token.text
                             if "</doctag>" in token.text:
                                 break

diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
index 6b9f352b..6633c842 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -141,7 +141,10 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                     _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
-                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+                    page.predictions.vlm_response = VlmPrediction(
+                        text=page_tags,
+                        generation_time=generation_time,
+                    )

                 yield page
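
With `generation_time` now recorded on every `VlmPrediction`, per-document inference time can be derived from the page predictions instead of wall-clock timing. A sketch of the aggregation, mirroring what the updated example script below does:

    def total_generation_time(conv_res) -> float:
        # Sum per-page VLM generation times; pages without a prediction count as zero.
        total = 0.0
        for page in conv_res.pages:
            if page.predictions.vlm_response is not None:
                total += page.predictions.vlm_response.generation_time
        return total
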
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 39d6bb33..b902bb2e 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -1,3 +1,4 @@
+import re
 import logging
 from io import BytesIO
 from pathlib import Path
@@ -19,6 +20,14 @@ from docling.datamodel.pipeline_model_specializations import (
     InferenceFramework,
     ResponseFormat,
 )
+from docling_core.types.doc.base import (
+    Size,
+    BoundingBox,
+)
+from docling_core.types.doc import (
+    ProvenanceItem,
+    DoclingDocument,
+)
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
@@ -237,6 +246,44 @@ class VlmPipeline(PaginatedPipeline):

         return conv_res

+    def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
+        doctags_list = []
+        image_list = []
+        for page in conv_res.pages:
+            predicted_doctags = ""
+            img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
+            if page.predictions.vlm_response:
+                predicted_doctags = page.predictions.vlm_response.text
+            if page.image:
+                img = page.image
+            image_list.append(img)
+            doctags_list.append(predicted_doctags)
+
+        doctags_list_c = cast(List[Union[Path, str]], doctags_list)
+        image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
+        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+            doctags_list_c, image_list_c
+        )
+        conv_res.document.load_from_doctags(doctags_doc)
+
+        # If forced backend text, replace model-predicted text with the backend text
+        if page.size and self.force_backend_text:
+            scale = self.pipeline_options.images_scale
+            for element, _level in conv_res.document.iterate_items():
+                if not isinstance(element, TextItem) or len(element.prov) == 0:
+                    continue
+                crop_bbox = (
+                    element.prov[0]
+                    .bbox.scaled(scale=scale)
+                    .to_top_left_origin(page_height=page.size.height * scale)
+                )
+                txt = self.extract_text_from_backend(page, crop_bbox)
+                element.text = txt
+                element.orig = txt
+
+        return conv_res.document
+
+    """
     def _turn_md_into_doc(self, conv_res):
         predicted_text = ""
         for pg_idx, page in enumerate(conv_res.pages):
@@ -254,7 +301,83 @@ class VlmPipeline(PaginatedPipeline):
             path_or_stream=response_bytes,
         )
         return backend.convert()
+    """
+
+    def _turn_md_into_doc(self, conv_res):
+        def _extract_markdown_code(text):
+            """
+            Extracts text from markdown code blocks (enclosed in triple backticks).
+            If no code blocks are found, returns the original text.
+
+            Args:
+                text (str): Input text that may contain markdown code blocks
+
+            Returns:
+                str: Extracted code if code blocks exist, otherwise original text
+            """
+            # Regex pattern to match content between triple backticks.
+            # It handles multiline content and an optional language specifier.
+            pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
+
+            # Search with the DOTALL flag to match across multiple lines
+            mtch = re.search(pattern, text, re.DOTALL)
+
+            if mtch:
+                # Return only the content of the first capturing group
+                return mtch.group(1)
+            else:
+                # No code blocks found, return original text
+                return text
+
+        for pg_idx, page in enumerate(conv_res.pages):
+            page_no = pg_idx + 1  # FIXME: might be incorrect
+
+            predicted_text = ""
+            if page.predictions.vlm_response:
+                predicted_text = page.predictions.vlm_response.text + "\n\n"
+
+            predicted_text = _extract_markdown_code(text=predicted_text)
+
+            response_bytes = BytesIO(predicted_text.encode("utf8"))
+            out_doc = InputDocument(
+                path_or_stream=response_bytes,
+                filename=conv_res.input.file.name,
+                format=InputFormat.MD,
+                backend=MarkdownDocumentBackend,
+            )
+            backend = MarkdownDocumentBackend(
+                in_doc=out_doc,
+                path_or_stream=response_bytes,
+            )
+            page_doc = backend.convert()
+
+            if page.image is not None:
+                pg_width = page.image.width
+                pg_height = page.image.height
+            else:
+                pg_width = 1
+                pg_height = 1
+
+            conv_res.document.add_page(
+                page_no=page_no,
+                size=Size(width=pg_width, height=pg_height),
+                image=ImageRef.from_pil(image=page.image, dpi=72)
+                if page.image
+                else None,
+            )
+
+            for item, level in page_doc.iterate_items():
+                item.prov = [
+                    ProvenanceItem(
+                        page_no=page_no,
+                        bbox=BoundingBox(t=0.0, b=0.0, l=0.0, r=0.0),
+                        charspan=[0, 0],
+                    )
+                ]
+                conv_res.document.append_child_item(child=item)
+
+        return conv_res.document
+
     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:
         return VlmPipelineOptions()
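
The regex in `_extract_markdown_code` unwraps a model response that arrives as one fenced markdown block and leaves any other response untouched. A standalone check of the same pattern:

    import re

    pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"

    text = "```markdown\n# Title\n\nSome body text.\n```\n"
    mtch = re.search(pattern, text, re.DOTALL)
    # Prints the unwrapped content: the heading and body, without the fences
    print(mtch.group(1) if mtch else text)
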
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 9c04d561..1310637d 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -25,10 +25,7 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

-sources = [
-    # "tests/data/2305.03393v1-pg9-img.png",
-    "tests/data/pdf/2305.03393v1-pg9.pdf",
-]
+from tabulate import tabulate

 ## Use experimental VlmPipeline
 pipeline_options = VlmPipelineOptions()
@@ -104,75 +101,120 @@ qwen_vlm_conversion_options = HuggingFaceVlmOptions(
 pipeline_options.vlm_options = qwen_vlm_conversion_options
 """

-## Set up pipeline for PDF or image inputs
-converter = DocumentConverter(
-    format_options={
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_cls=VlmPipeline,
-            pipeline_options=pipeline_options,
-        ),
-        InputFormat.IMAGE: PdfFormatOption(
-            pipeline_cls=VlmPipeline,
-            pipeline_options=pipeline_options,
-        ),
-    },
-)
+def convert(sources: list[Path], converter):
+    for source in sources:
+        # start_time = time.time()
+        print("================================================")
+        print(f"Processing... {source}")
+        print("================================================")
+        print("")

-out_path = Path("scratch")
-out_path.mkdir(parents=True, exist_ok=True)
+        res = converter.convert(source)
+
+        print("")
+        # print(res.document.export_to_markdown())
+
+        model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
+        framework = pipeline_options.vlm_options.inference_framework
+        fname = f"{res.input.file.stem}-{model_id}-{framework}"

-for source in sources:
-    start_time = time.time()
-    print("================================================")
-    print(f"Processing... {source}")
-    print("================================================")
-    print("")
+        inference_time = 0.0
+        for i, page in enumerate(res.pages):
+            inference_time += page.predictions.vlm_response.generation_time
+            print("")
+            print(
+                f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format} in {page.predictions.vlm_response.generation_time} [sec]:"
+            )
+            print(page.predictions.vlm_response.text)
+            print(" ---------- ")
+
+        print("===== Final output of the converted document =======")

-    res = converter.convert(source)
+        with (out_path / f"{fname}.json").open("w") as fp:
+            fp.write(json.dumps(res.document.export_to_dict()))

-    print("")
-    # print(res.document.export_to_markdown())
+        res.document.save_as_json(
+            out_path / f"{fname}.json",
+            image_mode=ImageRefMode.PLACEHOLDER,
+        )
+        print(f" => produced {out_path / fname}.json")

-    model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
-    fname = f"{model_id}-{res.input.file.stem}"
+        res.document.save_as_markdown(
+            out_path / f"{fname}.md",
+            image_mode=ImageRefMode.PLACEHOLDER,
+        )
+        print(f" => produced {out_path / fname}.md")

-    for i, page in enumerate(res.pages):
+        res.document.save_as_html(
+            out_path / f"{fname}.html",
+            image_mode=ImageRefMode.EMBEDDED,
+            labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
+            split_page_view=True,
+        )
+        print(f" => produced {out_path / fname}.html")
+
+        pg_num = res.document.num_pages()
         print("")
         print(
-            f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format}:"
+            f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
         )
-        print(page.predictions.vlm_response.text)
-        print(" ---------- ")
+        print("====================================================")

-    print("===== Final output of the converted document =======")
+        # return [source, f"{out_path / fname}.html", model_id, framework, inference_time]
+        return [source, model_id, framework, pg_num, inference_time]
+
+
+if __name__ == "__main__":

-    with (out_path / f"{fname}.json").open("w") as fp:
-        fp.write(json.dumps(res.document.export_to_dict()))
+    sources = [
+        # "tests/data/2305.03393v1-pg9-img.png",
+        "tests/data/pdf/2305.03393v1-pg9.pdf",
+    ]
+
+    out_path = Path("scratch")
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    ## Use VlmPipeline
+    pipeline_options = VlmPipelineOptions()

-    res.document.save_as_json(
-        out_path / f"{fname}.json",
-        image_mode=ImageRefMode.PLACEHOLDER,
-    )
-    print(f" => produced {out_path / fname}.json")
+    # If force_backend_text = True, text from backend will be used instead of generated text
+    pipeline_options.force_backend_text = False
+    pipeline_options.generate_page_images = True

-    res.document.save_as_markdown(
-        out_path / f"{fname}.md",
-        image_mode=ImageRefMode.PLACEHOLDER,
-    )
-    print(f" => produced {out_path / fname}.md")
+    ## On GPU systems, enable flash_attention_2 with CUDA:
+    # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
+    # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True

-    res.document.save_as_html(
-        out_path / f"{fname}.html",
-        image_mode=ImageRefMode.EMBEDDED,
-        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
-        split_page_view=True,
-    )
-    print(f" => produced {out_path / fname}.html")
+    rows = []
+    for vlm_options in [
+        # smoldocling_vlm_conversion_options,
+        smoldocling_vlm_mlx_conversion_options,
+        granite_vision_vlm_conversion_options,
+        # phi_vlm_conversion_options,
+        qwen25_vl_3b_vlm_mlx_conversion_options,
+        pixtral_12b_vlm_mlx_conversion_options,
+    ]:
+        pipeline_options.vlm_options = vlm_options
+
+        ## Set up pipeline for PDF or image inputs
+        converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_cls=VlmPipeline,
+                    pipeline_options=pipeline_options,
+                ),
+                InputFormat.IMAGE: PdfFormatOption(
+                    pipeline_cls=VlmPipeline,
+                    pipeline_options=pipeline_options,
+                ),
+            },
+        )
+
+        row = convert(sources=sources, converter=converter)
+        print("pipelines: \n", converter._get_initialized_pipelines())
+
+        rows.append(row)
+
+    print(tabulate(rows))

-    pg_num = res.document.num_pages()
-    print("")
-    inference_time = time.time() - start_time
-    print(
-        f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
-    )
-    print("====================================================")
+    print("see if memory gets released ...")
+    time.sleep(10)
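
`tabulate(rows)` prints the summary table without column names; `tabulate` also accepts explicit headers, so a possible refinement (the header labels here are illustrative, matching the order of the row returned by `convert`):

    from tabulate import tabulate

    print(
        tabulate(
            rows,
            headers=["source", "model", "framework", "pages", "inference time [s]"],
        )
    )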