Finalise the remaining points for VLM support

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-12-16 08:38:14 +00:00 · 2025-05-16 12:39:26 +02:00
parent fc61258273
commit d41b856961
6 changed files with 246 additions and 64 deletions
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -64,6 +64,7 @@ class ApiVlmOptions(BaseVlmOptions):
    params: Dict[str, Any] = {}
    scale: float = 2.0
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -186,6 +186,11 @@ class DocumentConverter:
            Tuple[Type[BasePipeline], str], BasePipeline
        ] = {}
    def _get_initialized_pipelines(
        self,
    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
        """Return the mapping of pipelines this converter has initialized.

        The returned object is ``self.initialized_pipelines`` itself, not a
        copy. Keys are ``(pipeline class, str)`` tuples; the string part is
        presumably the pipeline-options hash (verify against the code that
        populates the cache).
        """
        pipelines = self.initialized_pipelines
        return pipelines
    def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
        """Generate a hash of pipeline options to use as part of the cache key."""
        options_str = str(pipeline_options.model_dump())
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -71,7 +71,7 @@ class HuggingFaceMlxModel(BasePageModel):
            if not page._backend.is_valid():
                yield page
            else:
-                with TimeRecorder(conv_res, "vlm"):
+                with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                    assert page.size is not None
                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
@@ -124,6 +124,8 @@ class HuggingFaceMlxModel(BasePageModel):
                                    logprob=token.logprobs[0, token.token],
                                )
                            )
                        else:
                            _log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")
                        output += token.text
                        if "</doctag>" in token.text:
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -141,7 +141,10 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                    _log.debug(
                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                    )
-                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+                    page.predictions.vlm_response = VlmPrediction(
                        text=page_tags,
                        generation_time=generation_time,
                    )
                yield page
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -1,3 +1,4 @@
 import re
 import logging
 from io import BytesIO
 from pathlib import Path
@@ -19,6 +20,14 @@ from docling.datamodel.pipeline_model_specializations import (
    InferenceFramework,
    ResponseFormat,
 )
 from docling_core.types.doc.base import (
    Size,
    BoundingBox,    
 )
 from docling_core.types.doc import (
    ProvenanceItem,
    DoclingDocument
 )
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
@@ -237,6 +246,48 @@ class VlmPipeline(PaginatedPipeline):
        return conv_res
    def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
        """Populate ``conv_res.document`` from the DocTags predicted per page.

        For every page in ``conv_res.pages`` the predicted DocTags text (or an
        empty string when there is no VLM response) is paired with the page
        image (or a 1x1 white placeholder when the page has no image). The
        pairs are turned into a ``DocTagsDocument`` and loaded into
        ``conv_res.document``.

        When ``self.force_backend_text`` is set, the text of every ``TextItem``
        that has provenance is replaced by the text the backend extracts from
        the item's bounding box on the page the item belongs to.

        Args:
            conv_res: Conversion result whose ``pages`` carry the VLM
                predictions and whose ``document`` is filled in place.

        Returns:
            ``conv_res.document`` after it has been populated.
        """
        doctags_list = []
        image_list = []
        for page in conv_res.pages:
            vlm_response = page.predictions.vlm_response
            # Pages without a prediction contribute an empty DocTags string.
            doctags_list.append(vlm_response.text if vlm_response else "")
            # Pages without an image get a 1x1 white placeholder so that the
            # doctags/image lists stay aligned.
            image_list.append(
                page.image
                if page.image
                else PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
            )

        doctags_list_c = cast(List[Union[Path, str]], doctags_list)
        image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
            doctags_list_c, image_list_c
        )
        conv_res.document.load_from_doctags(doctags_doc)

        # If forced backend text, replace model predicted text with backend one
        if self.force_backend_text:
            scale = self.pipeline_options.images_scale
            pages = list(conv_res.pages)
            for element, _level in conv_res.document.iterate_items():
                if not isinstance(element, TextItem) or len(element.prov) == 0:
                    continue

                prov = element.prov[0]
                # Resolve the page the element actually sits on instead of
                # reusing whichever page the loop above ended on.
                # NOTE(review): assumes prov.page_no is the 1-based position of
                # the page in conv_res.pages (doctags/images are passed in that
                # order) -- confirm against DocTagsDocument page numbering.
                page_idx = prov.page_no - 1
                if not 0 <= page_idx < len(pages):
                    continue
                page = pages[page_idx]
                if not page.size:
                    # Without a page size the bbox cannot be converted.
                    continue

                crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin(
                    page_height=page.size.height * scale
                )
                txt = self.extract_text_from_backend(page, crop_bbox)
                element.text = txt
                element.orig = txt

        return conv_res.document
    """
    def _turn_md_into_doc(self, conv_res):
        predicted_text = ""
        for pg_idx, page in enumerate(conv_res.pages):
@@ -254,6 +305,84 @@ class VlmPipeline(PaginatedPipeline):
            path_or_stream=response_bytes,
        )
        return backend.convert()
    """
    def _turn_md_into_doc(self, conv_res):
        """Populate ``conv_res.document`` from per-page Markdown predictions.

        Each page's predicted text is unwrapped from a surrounding fenced code
        block (if any), converted with ``MarkdownDocumentBackend`` and the
        resulting items are appended to ``conv_res.document``. A page entry is
        added for every page, sized from the page image (1x1 when the page has
        no image).

        Args:
            conv_res: Conversion result whose ``pages`` carry the VLM
                predictions and whose ``document`` is extended in place.

        Returns:
            ``conv_res.document`` after all pages have been appended.
        """

        def _extract_markdown_code(text):
            """Strip a fenced code block that wraps the whole text.

            The pattern is anchored at both ends, so it only matches when
            ``text`` starts with a triple-backtick fence (optionally followed
            by a language specifier) and ends with a closing fence followed by
            nothing but newlines.

            Args:
                text (str): Input text that may be wrapped in a code fence.

            Returns:
                str: The fenced content if the text is wrapped, otherwise the
                original text unchanged.
            """
            # DOTALL lets the captured content span multiple lines.
            pattern = r'^```(?:\w*\n)?(.*?)```(\n)*$'
            mtch = re.search(pattern, text, re.DOTALL)
            return mtch.group(1) if mtch else text

        for pg_idx, page in enumerate(conv_res.pages):
            page_no = pg_idx + 1  # FIXME: might be incorrect

            predicted_text = ""
            if page.predictions.vlm_response:
                predicted_text = page.predictions.vlm_response.text + "\n\n"
            predicted_text = _extract_markdown_code(text=predicted_text)

            # Feed the prediction to the Markdown backend as an in-memory file.
            response_bytes = BytesIO(predicted_text.encode("utf8"))
            out_doc = InputDocument(
                path_or_stream=response_bytes,
                filename=conv_res.input.file.name,
                format=InputFormat.MD,
                backend=MarkdownDocumentBackend,
            )
            backend = MarkdownDocumentBackend(
                in_doc=out_doc,
                path_or_stream=response_bytes,
            )
            page_doc = backend.convert()

            if page.image is not None:
                pg_width = page.image.width
                pg_height = page.image.height
            else:
                # No rendered image: fall back to a 1x1 page.
                pg_width = 1
                pg_height = 1

            conv_res.document.add_page(
                page_no=page_no,
                size=Size(width=pg_width, height=pg_height),
                image=ImageRef.from_pil(image=page.image, dpi=72) if page.image else None,
            )

            for item, _level in page_doc.iterate_items():
                # The Markdown backend has no layout information, so attach a
                # placeholder provenance that only records the page number.
                item.prov = [
                    ProvenanceItem(
                        page_no=page_no,
                        bbox=BoundingBox(t=0.0, b=0.0, l=0.0, r=0.0),
                        charspan=(0, 0),
                    )
                ]
                conv_res.document.append_child_item(child=item)

        return conv_res.document
    @classmethod
    def get_default_options(cls) -> VlmPipelineOptions:
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -25,10 +25,7 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
-sources = [
+from tabulate import tabulate
    # "tests/data/2305.03393v1-pg9-img.png",
    "tests/data/pdf/2305.03393v1-pg9.pdf",
 ]
 ## Use experimental VlmPipeline
 pipeline_options = VlmPipelineOptions()
@@ -104,25 +101,9 @@ qwen_vlm_conversion_options = HuggingFaceVlmOptions(
 pipeline_options.vlm_options = qwen_vlm_conversion_options
 """
-## Set up pipeline for PDF or image inputs
+def convert(sources: list[Path], converter):
 converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
        InputFormat.IMAGE: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    },
 )
 out_path = Path("scratch")
 out_path.mkdir(parents=True, exist_ok=True)
    for source in sources:
-    start_time = time.time()
+        #start_time = time.time()
        print("================================================")
        print(f"Processing... {source}")
        print("================================================")
@@ -134,12 +115,15 @@ for source in sources:
        # print(res.document.export_to_markdown())
        model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
-    fname = f"{model_id}-{res.input.file.stem}"
+        framework = pipeline_options.vlm_options.inference_framework
        fname = f"{res.input.file.stem}-{model_id}-{framework}"
        inference_time = 0.0
        for i, page in enumerate(res.pages):
            inference_time += page.predictions.vlm_response.generation_time
            print("")
            print(
-            f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format}:"
+                f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format} in {page.predictions.vlm_response.generation_time} [sec]:"
            )
            print(page.predictions.vlm_response.text)
            print(" ---------- ")
@@ -171,8 +155,66 @@ for source in sources:
        pg_num = res.document.num_pages()
        print("")
    inference_time = time.time() - start_time
        print(
            f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
        )
        print("====================================================")
        # return [source, f"{out_path / fname}.html", model_id, framework, inference_time, ]
        return [source, model_id, framework, pg_num, inference_time, ]
 if __name__ == "__main__":
    sources = [
        # "tests/data/2305.03393v1-pg9-img.png",
        "tests/data/pdf/2305.03393v1-pg9.pdf",
    ]
    out_path = Path("scratch")
    out_path.mkdir(parents=True, exist_ok=True)
    ## Use VlmPipeline
    pipeline_options = VlmPipelineOptions()
    # If force_backend_text = True, text from backend will be used instead of generated text
    pipeline_options.force_backend_text = False
    pipeline_options.generate_page_images = True
    ## On GPU systems, enable flash_attention_2 with CUDA:
    # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
    # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
    rows = []
    for vlm_options in [
            # smoldocling_vlm_conversion_options, \
            smoldocling_vlm_mlx_conversion_options, \
            granite_vision_vlm_conversion_options, \
            # phi_vlm_conversion_options, \
            qwen25_vl_3b_vlm_mlx_conversion_options, \
            pixtral_12b_vlm_mlx_conversion_options,
    ]:
        pipeline_options.vlm_options = vlm_options
        ## Set up pipeline for PDF or image inputs
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=VlmPipeline,
                    pipeline_options=pipeline_options,
                ),
                InputFormat.IMAGE: PdfFormatOption(
                    pipeline_cls=VlmPipeline,
                    pipeline_options=pipeline_options,
                ),
            },
        )
        row = convert(sources=sources, converter=converter)
        print("pipelines: \n", converter._get_initialized_pipelines())
        rows.append(row)
        print(tabulate(rows))
        print("see if memory gets released ...")
        time.sleep(10)