feat(SmolDocling): Support MLX acceleration in VLM pipeline (#1199)

* Initial implementation to support MLX for VLM pipeline and SmolDocling Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * mlx_model unit Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Add CLI choices for VLM pipeline and model Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Initial implementation to support MLX for VLM pipeline and SmolDocling Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * mlx_model unit Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Add CLI choices for VLM pipeline and model Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated minimal vlm pipeline example Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * make vlm_pipeline python3.9 compatible Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixed extract_text_from_backend definition Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated README Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated example Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated documentation Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * corrections in the documentation Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Consmetic changes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-03-19 15:38:54 +01:00
parent b454aa1551
commit 1c26769785
9 changed files with 319 additions and 66 deletions
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -10,13 +10,15 @@ from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

 sources = [
-    "tests/data/2305.03393v1-pg9-img.png",
+    # "tests/data/2305.03393v1-pg9-img.png",
+    "tests/data/pdf/2305.03393v1-pg9.pdf",
 ]

 ## Use experimental VlmPipeline
@@ -29,7 +31,10 @@ pipeline_options.force_backend_text = False
 # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True

 ## Pick a VLM model. We choose SmolDocling-256M by default
-pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+
+## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
+pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options

 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
@@ -63,9 +68,6 @@ for source in sources:

    res = converter.convert(source)

-    print("------------------------------------------------")
-    print("MD:")
-    print("------------------------------------------------")
    print("")
    print(res.document.export_to_markdown())

@@ -83,8 +85,17 @@ for source in sources:
    with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
        fp.write(json.dumps(res.document.export_to_dict()))

-    pg_num = res.document.num_pages()
+    res.document.save_as_json(
+        out_path / f"{res.input.file.stem}.md",
+        image_mode=ImageRefMode.PLACEHOLDER,
+    )

+    res.document.save_as_markdown(
+        out_path / f"{res.input.file.stem}.md",
+        image_mode=ImageRefMode.PLACEHOLDER,
+    )
+
+    pg_num = res.document.num_pages()
    print("")
    inference_time = time.time() - start_time
    print(