From 5c606c25740fa38c472ef87f9a98f97fce1e8e33 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Thu, 12 Jun 2025 17:57:29 +0200
Subject: [PATCH] scaffolding in place

Signed-off-by: Peter Staar
---
 docling/cli/main.py                          | 35 +++++++++--
 docling/datamodel/asr_model_specs.py         | 27 ++++++++
 docling/datamodel/base_models.py             |  3 +
 docling/datamodel/pipeline_options.py        | 15 ++++-
 .../datamodel/pipeline_options_asr_model.py  | 50 +++++++++++++++
 docling/pipeline/asr_pipeline.py             | 62 +++++++++++++++++++
 6 files changed, 186 insertions(+), 6 deletions(-)
 create mode 100644 docling/datamodel/asr_model_specs.py
 create mode 100644 docling/datamodel/pipeline_options_asr_model.py
 create mode 100644 docling/pipeline/asr_pipeline.py

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 083f53b2..3e3796cf 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -29,6 +29,10 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_TINY,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +41,13 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -56,6 +61,7 @@ from docling.datamodel.vlm_model_specs import (
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +302,17 @@ def convert(  # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
     ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -532,7 +542,7 @@ def convert(  # noqa: C901
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
     pipeline_options: PaginatedPipelineOptions

-    if pipeline == PdfPipeline.STANDARD:
+    if pipeline == ProcessingPipeline.STANDARD:
         pipeline_options = PdfPipelineOptions(
             allow_external_plugins=allow_external_plugins,
             enable_remote_services=enable_remote_services,
@@ -574,7 +584,7 @@ def convert(  # noqa: C901
             pipeline_options=pipeline_options,
             backend=backend,  # pdf_backend
         )
-    elif pipeline == PdfPipeline.VLM:
+    elif pipeline == ProcessingPipeline.VLM:
         pipeline_options = VlmPipelineOptions(
             enable_remote_services=enable_remote_services,
         )
@@ -599,13 +609,28 @@ def convert(  # noqa: C901
         pdf_format_option = PdfFormatOption(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
+    elif pipeline == ProcessingPipeline.ASR:
+        audio_pipeline_options = AsrPipelineOptions(
+            # enable_remote_services=enable_remote_services,
+        )
+
+        audio_format_option = PdfFormatOption(
+            pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options
+        )
+
+        """
+        if asr_model == AsrModelType.WHISPER_TINY:
+            pipeline_options.asr_options = WHISPER_TINY
+        """

     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
+        # audio_pipeline_options.artifacts_path = artifacts_path

     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: pdf_format_option,
         InputFormat.IMAGE: pdf_format_option,
+        InputFormat.AUDIO: audio_format_option,
     }
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py
new file mode 100644
index 00000000..71daa8a8
--- /dev/null
+++ b/docling/datamodel/asr_model_specs.py
@@ -0,0 +1,27 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+    AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_asr_model import (
+    # ApiAsrOptions,
+    InferenceFramework,
+    InlineAsrOptions,
+    AsrResponseFormat,
+    TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
+# Whisper tiny
+WHISPER_TINY = InlineAsrOptions(
+    repo_id="openai/whisper-tiny",
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    response_format=AsrResponseFormat.WHISPER,
+)
+
+class AsrModelType(str, Enum):
+    WHISPER_TINY = "whisper_tiny"
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 2cd25150..3eb88548 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
+    AUDIO = "audio"


 class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
+    InputFormat.AUDIO: ["wav", "mp3"],
 }

 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
+    InputFormat.AUDIO: ["audio/wav"],
 }

 MimeTypeToFormat: dict[str, list[InputFormat]] = {
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index cde5cd33..302afcdf 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -13,6 +13,13 @@ from typing_extensions import deprecated

 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_TINY as whisper_tiny,
+    AsrModelType,
+)
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -260,6 +267,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )


+class AsrPipelineOptions(PipelineOptions):
+    asr_options: Union[InlineAsrOptions] = whisper_tiny
+    artifacts_path: Optional[Union[Path, str]] = None
+
+
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

@@ -295,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     generate_parsed_pages: bool = False


-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
+    ASR = "asr"
diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py
new file mode 100644
index 00000000..25d0dd15
--- /dev/null
+++ b/docling/datamodel/pipeline_options_asr_model.py
@@ -0,0 +1,50 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import InferenceFramework, TransformersModelType
+
+class BaseAsrOptions(BaseModel):
+    kind: str
+    # prompt: str
+
+
+class AsrResponseFormat(str, Enum):
+    WHISPER = "whisper"
+
+
+class InlineAsrOptions(BaseAsrOptions):
+    kind: Literal["inline_model_options"] = "inline_model_options"
+
+    repo_id: str
+    trust_remote_code: bool = False
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    inference_framework: InferenceFramework
+    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    response_format: AsrResponseFormat
+
+    torch_dtype: Optional[str] = None
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    temperature: float = 0.0
+    stop_strings: List[str] = []
+    extra_generation_config: Dict[str, Any] = {}
+
+    use_kv_cache: bool = True
+    max_new_tokens: int = 4096
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
new file mode 100644
index 00000000..70e52537
--- /dev/null
+++ b/docling/pipeline/asr_pipeline.py
@@ -0,0 +1,62 @@
+import logging
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional, Union, cast
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
+)
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+)
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+    AsrResponseFormat,
+)
+from docling.datamodel.settings import settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docling.datamodel.document import ConversionResult, InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class AsrPipeline(BasePipeline):
+    def __init__(self, pipeline_options: AsrPipelineOptions):
+        super().__init__(pipeline_options)
+        self.keep_backend = True
+
+        self.pipeline_options: AsrPipelineOptions
+
+        artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            artifacts_path = Path(settings.artifacts_path).expanduser()
+
+        if artifacts_path is not None and not artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )
+
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        total_elapsed_time = 0.0
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            print("do something")
+
+        return conv_res
+
+    """
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        status = ConversionStatus()
+        return status
+    """
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return True
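
Note (not part of the patch): a minimal sketch of how the new ASR scaffolding could be exercised
from Python, mirroring the FormatOption wiring this patch adds to the CLI. The input file name
sample.wav is hypothetical, and with this commit _build_document is still a stub, so no transcript
is produced yet; the snippet only shows the intended plumbing. The equivalent CLI invocation should
be along the lines of "docling --pipeline asr --asr-model whisper_tiny sample.wav", assuming typer
derives the dashed flag names from the new parameters.

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import AsrPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.asr_pipeline import AsrPipeline

    # AsrPipelineOptions defaults asr_options to the WHISPER_TINY spec defined in this patch.
    pipeline_options = AsrPipelineOptions()

    converter = DocumentConverter(
        allowed_formats=[InputFormat.AUDIO],
        format_options={
            # The CLI branch above reuses PdfFormatOption to carry the pipeline class
            # and options for audio input, so the same is done here.
            InputFormat.AUDIO: PdfFormatOption(
                pipeline_cls=AsrPipeline, pipeline_options=pipeline_options
            )
        },
    )

    result = converter.convert("sample.wav")  # hypothetical input; "wav" maps to InputFormat.AUDIO
    print(result.status)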