Add scaffolding for the audio pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2025-06-12 18:24:13 +02:00
parent 5c606c2574
commit 1d4008ac7c
3 changed files with 57 additions and 35 deletions

View File

@ -542,7 +542,41 @@ def convert( # noqa: C901
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
pipeline_options: PaginatedPipelineOptions pipeline_options: PaginatedPipelineOptions
if pipeline == ProcessingPipeline.STANDARD: format_options: Dict[InputFormat, FormatOption] = {}
if pipeline == ProcessingPipeline.VLM:
pipeline_options = VlmPipelineOptions(
enable_remote_services=enable_remote_services,
)
if vlm_model == VlmModelType.GRANITE_VISION:
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
elif vlm_model == VlmModelType.SMOLDOCLING:
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
if sys.platform == "darwin":
try:
import mlx_vlm
pipeline_options.vlm_options = SMOLDOCLING_MLX
except ImportError:
_log.warning(
"To run SmolDocling faster, please install mlx-vlm:\n"
"pip install mlx-vlm"
)
pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}
elif pipeline == ProcessingPipeline.STANDARD:
pipeline_options = PdfPipelineOptions( pipeline_options = PdfPipelineOptions(
allow_external_plugins=allow_external_plugins, allow_external_plugins=allow_external_plugins,
enable_remote_services=enable_remote_services, enable_remote_services=enable_remote_services,
@ -584,40 +618,28 @@ def convert( # noqa: C901
pipeline_options=pipeline_options, pipeline_options=pipeline_options,
backend=backend, # pdf_backend backend=backend, # pdf_backend
) )
elif pipeline == ProcessingPipeline.VLM:
pipeline_options = VlmPipelineOptions(
enable_remote_services=enable_remote_services,
)
if vlm_model == VlmModelType.GRANITE_VISION: format_options: Dict[InputFormat, FormatOption] = {
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS InputFormat.PDF: pdf_format_option,
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA: InputFormat.IMAGE: pdf_format_option,
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA }
elif vlm_model == VlmModelType.SMOLDOCLING:
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
if sys.platform == "darwin":
try:
import mlx_vlm
pipeline_options.vlm_options = SMOLDOCLING_MLX
except ImportError:
_log.warning(
"To run SmolDocling faster, please install mlx-vlm:\n"
"pip install mlx-vlm"
)
pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
elif pipeline == ProcessingPipeline.ASR: elif pipeline == ProcessingPipeline.ASR:
audio_pipeline_options = AsrPipelineOptions( audio_pipeline_options = AsrPipelineOptions(
# enable_remote_services=enable_remote_services, # enable_remote_services=enable_remote_services,
artifacts_path = artifacts_path
) )
audio_format_option = PdfFormatOption( audio_format_option = PdfFormatOption(
pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options pipeline_cls=AsrPipeline,
pipeline_options=audio_pipeline_options,
# backend = FIXME
) )
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.AUDIO_WAV: audio_format_option,
}
""" """
if asr_model == AsrModelType.WHISPER_TINY: if asr_model == AsrModelType.WHISPER_TINY:
pipeline_options.asr_options = WHISPER_TINY: pipeline_options.asr_options = WHISPER_TINY:
@ -627,11 +649,6 @@ def convert( # noqa: C901
pipeline_options.artifacts_path = artifacts_path pipeline_options.artifacts_path = artifacts_path
# audio_pipeline_options.artifacts_path = artifacts_path # audio_pipeline_options.artifacts_path = artifacts_path
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
InputFormat.AUDIO: audio_format_option,
}
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
allowed_formats=from_formats, allowed_formats=from_formats,
format_options=format_options, format_options=format_options,

View File

@ -49,7 +49,7 @@ class InputFormat(str, Enum):
XML_USPTO = "xml_uspto" XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats" XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling" JSON_DOCLING = "json_docling"
AUDIO = "audio" AUDIO_WAV = "wav"
class OutputFormat(str, Enum): class OutputFormat(str, Enum):
@ -74,7 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: ["xlsx", "xlsm"], InputFormat.XLSX: ["xlsx", "xlsm"],
InputFormat.XML_USPTO: ["xml", "txt"], InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"], InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"], InputFormat.AUDIO_WAV: ["wav"],
} }
FormatToMimeType: Dict[InputFormat, List[str]] = { FormatToMimeType: Dict[InputFormat, List[str]] = {
@ -106,7 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
], ],
InputFormat.XML_USPTO: ["application/xml", "text/plain"], InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"], InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/wav"], InputFormat.AUDIO_WAV: ["audio/wav", "audio/x-wav"],
} }
MimeTypeToFormat: dict[str, list[InputFormat]] = { MimeTypeToFormat: dict[str, list[InputFormat]] = {

View File

@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
backend: Type[AbstractDocumentBackend] backend: Type[AbstractDocumentBackend]
if format not in format_options.keys(): if format not in format_options.keys():
_log.error( _log.error(
f"Input document {obj.name} does not match any allowed format." f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
) )
backend = _DummyBackend backend = _DummyBackend
else: else:
@ -280,9 +280,12 @@ class _DocumentConversionInput(BaseModel):
if isinstance(obj, Path): if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj)) mime = filetype.guess_mime(str(obj))
print(f"mime: {mime}")
if mime is None: if mime is None:
ext = obj.suffix[1:] ext = obj.suffix[1:]
print(f"ext: {ext}")
mime = _DocumentConversionInput._mime_from_extension(ext) mime = _DocumentConversionInput._mime_from_extension(ext)
print(f"mime: {mime}")
if mime is None: # must guess from if mime is None: # must guess from
with obj.open("rb") as f: with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB content = f.read(1024) # Read first 1KB
@ -318,6 +321,8 @@ class _DocumentConversionInput(BaseModel):
mime = mime or _DocumentConversionInput._detect_csv(content) mime = mime or _DocumentConversionInput._detect_csv(content)
mime = mime or "text/plain" mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, []) formats = MimeTypeToFormat.get(mime, [])
print(formats)
if formats: if formats:
if len(formats) == 1 and mime not in ("text/plain"): if len(formats) == 1 and mime not in ("text/plain"):
return formats[0] return formats[0]