From 8e2b0b39c132d0323be5091923b9cfe9130200a3 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 19 Mar 2025 13:18:04 +0100 Subject: [PATCH] Add CLI choices for VLM pipeline and model Signed-off-by: Christoph Auer --- docling/cli/main.py | 119 ++++++++++++++++++-------- docling/datamodel/pipeline_options.py | 14 ++- docling/models/hf_mlx_model.py | 13 ++- docling/pipeline/vlm_pipeline.py | 6 -- pyproject.toml | 1 + 5 files changed, 103 insertions(+), 50 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index 7f0f20bf..c85a04f3 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import ( AcceleratorOptions, EasyOcrOptions, OcrOptions, + PaginatedPipelineOptions, PdfBackend, + PdfPipeline, PdfPipelineOptions, TableFormerMode, + VlmModelType, + VlmPipelineOptions, + granite_vision_vlm_conversion_options, + smoldocling_vlm_conversion_options, + smoldocling_vlm_mlx_conversion_options, ) from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.models.factories import get_ocr_factory +from docling.pipeline.vlm_pipeline import VlmPipeline warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") @@ -200,6 +208,14 @@ def convert( help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. 
In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.", ), ] = ImageRefMode.EMBEDDED, + pipeline: Annotated[ + PdfPipeline, + typer.Option(..., help="Choose the pipeline to process PDF or image files."), + ] = PdfPipeline.STANDARD, + vlm_model: Annotated[ + VlmModelType, + typer.Option(..., help="Choose the VLM model to use with PDF or image files."), + ] = VlmModelType.SMOLDOCLING, ocr: Annotated[ bool, typer.Option( @@ -420,50 +436,77 @@ def convert( ocr_options.lang = ocr_lang_list accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) - pipeline_options = PdfPipelineOptions( - allow_external_plugins=allow_external_plugins, - enable_remote_services=enable_remote_services, - accelerator_options=accelerator_options, - do_ocr=ocr, - ocr_options=ocr_options, - do_table_structure=True, - do_code_enrichment=enrich_code, - do_formula_enrichment=enrich_formula, - do_picture_description=enrich_picture_description, - do_picture_classification=enrich_picture_classes, - document_timeout=document_timeout, - ) - pipeline_options.table_structure_options.do_cell_matching = ( - True # do_cell_matching - ) - pipeline_options.table_structure_options.mode = table_mode + pipeline_options: PaginatedPipelineOptions - if image_export_mode != ImageRefMode.PLACEHOLDER: - pipeline_options.generate_page_images = True - pipeline_options.generate_picture_images = ( - True # FIXME: to be deprecated in verson 3 + if pipeline == PdfPipeline.STANDARD: + pipeline_options = PdfPipelineOptions( + allow_external_plugins=allow_external_plugins, + enable_remote_services=enable_remote_services, + accelerator_options=accelerator_options, + do_ocr=ocr, + ocr_options=ocr_options, + do_table_structure=True, + do_code_enrichment=enrich_code, + do_formula_enrichment=enrich_formula, + do_picture_description=enrich_picture_description, + do_picture_classification=enrich_picture_classes, + document_timeout=document_timeout, + ) + 
pipeline_options.table_structure_options.do_cell_matching = ( + True # do_cell_matching + ) + pipeline_options.table_structure_options.mode = table_mode + + if image_export_mode != ImageRefMode.PLACEHOLDER: + pipeline_options.generate_page_images = True + pipeline_options.generate_picture_images = ( + True # FIXME: to be deprecated in version 3 + ) + pipeline_options.images_scale = 2 + + backend: Type[PdfDocumentBackend] + if pdf_backend == PdfBackend.DLPARSE_V1: + backend = DoclingParseDocumentBackend + elif pdf_backend == PdfBackend.DLPARSE_V2: + backend = DoclingParseV2DocumentBackend + elif pdf_backend == PdfBackend.DLPARSE_V4: + backend = DoclingParseV4DocumentBackend # type: ignore + elif pdf_backend == PdfBackend.PYPDFIUM2: + backend = PyPdfiumDocumentBackend # type: ignore + else: + raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") + + pdf_format_option = PdfFormatOption( + pipeline_options=pipeline_options, + backend=backend, # pdf_backend + ) + elif pipeline == PdfPipeline.VLM: + pipeline_options = VlmPipelineOptions() + + if vlm_model == VlmModelType.GRANITE_VISION: + pipeline_options.vlm_options = granite_vision_vlm_conversion_options + elif vlm_model == VlmModelType.SMOLDOCLING: + pipeline_options.vlm_options = smoldocling_vlm_conversion_options + if sys.platform == "darwin": + try: + import mlx_vlm + + pipeline_options.vlm_options = ( + smoldocling_vlm_mlx_conversion_options + ) + except ImportError: + _log.warning( + "To run SmolDocling faster, please install mlx-vlm:\n" + "pip install mlx-vlm" + ) + + pdf_format_option = PdfFormatOption( + pipeline_cls=VlmPipeline, pipeline_options=pipeline_options ) - pipeline_options.images_scale = 2 if artifacts_path is not None: pipeline_options.artifacts_path = artifacts_path - backend: Type[PdfDocumentBackend] - if pdf_backend == PdfBackend.DLPARSE_V1: - backend = DoclingParseDocumentBackend - elif pdf_backend == PdfBackend.DLPARSE_V2: - backend = DoclingParseV2DocumentBackend - elif pdf_backend
== PdfBackend.DLPARSE_V4: - backend = DoclingParseV4DocumentBackend # type: ignore - elif pdf_backend == PdfBackend.PYPDFIUM2: - backend = PyPdfiumDocumentBackend # type: ignore - else: - raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") - - pdf_format_option = PdfFormatOption( - pipeline_options=pipeline_options, - backend=backend, # pdf_backend - ) format_options: Dict[InputFormat, FormatOption] = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index c96603c4..654e04df 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -308,6 +308,11 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( ) +class VlmModelType(str, Enum): + SMOLDOCLING = "smoldocling" + GRANITE_VISION = "granite_vision" + + # Define an enum for the backend options class PdfBackend(str, Enum): """Enum of valid PDF backends.""" @@ -343,13 +348,14 @@ class PipelineOptions(BaseModel): class PaginatedPipelineOptions(PipelineOptions): + artifacts_path: Optional[Union[Path, str]] = None + images_scale: float = 1.0 generate_page_images: bool = False generate_picture_images: bool = False class VlmPipelineOptions(PaginatedPipelineOptions): - artifacts_path: Optional[Union[Path, str]] = None generate_page_images: bool = True force_backend_text: bool = ( @@ -362,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions): class PdfPipelineOptions(PaginatedPipelineOptions): """Options for the PDF pipeline.""" - artifacts_path: Optional[Union[Path, str]] = None do_table_structure: bool = True # True: perform table structure extraction do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_code_enrichment: bool = False # True: perform code OCR @@ -393,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions): ) generate_parsed_pages: bool = False + + +class PdfPipeline(str, Enum): + STANDARD = 
"standard" + VLM = "vlm" diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py index 1c4577ac..762a6557 100644 --- a/docling/models/hf_mlx_model.py +++ b/docling/models/hf_mlx_model.py @@ -32,9 +32,15 @@ class HuggingFaceMlxModel(BasePageModel): self.vlm_options = vlm_options if self.enabled: - from mlx_vlm import generate, load # type: ignore - from mlx_vlm.prompt_utils import apply_chat_template # type: ignore - from mlx_vlm.utils import load_config, stream_generate # type: ignore + + try: + from mlx_vlm import generate, load # type: ignore + from mlx_vlm.prompt_utils import apply_chat_template # type: ignore + from mlx_vlm.utils import load_config, stream_generate # type: ignore + except ImportError: + raise ImportError( + "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models." + ) repo_cache_folder = vlm_options.repo_id.replace("/", "--") self.apply_chat_template = apply_chat_template @@ -113,7 +119,6 @@ class HuggingFaceMlxModel(BasePageModel): verbose=False, ): output += token.text - print(token.text, end="") if "" in token.text: break diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index d01ee7f1..5237f2ae 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -34,12 +34,6 @@ class VlmPipeline(PaginatedPipeline): super().__init__(pipeline_options) self.keep_backend = True - warnings.warn( - "The VlmPipeline is currently experimental and may change in upcoming versions without notice.", - category=UserWarning, - stacklevel=2, - ) - self.pipeline_options: VlmPipelineOptions artifacts_path: Optional[Path] = None diff --git a/pyproject.toml b/pyproject.toml index 0f85915f..8d121d36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -192,6 +192,7 @@ module = [ "docling_ibm_models.*", "easyocr.*", "ocrmac.*", + "mlx_vlm.*", "lxml.*", "huggingface_hub.*", "transformers.*",