""" Silero VAD wrapper using onnx-asr library """ import numpy as np import onnx_asr from typing import Optional, Tuple import logging logger = logging.getLogger(__name__) class SileroVAD: """ Voice Activity Detection using Silero VAD via onnx-asr. """ def __init__( self, providers: Optional[list] = None, threshold: float = 0.5, min_speech_duration_ms: int = 250, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, ): """ Initialize Silero VAD. Args: providers: Optional ONNX runtime providers threshold: Speech probability threshold (0.0-1.0) min_speech_duration_ms: Minimum duration of speech segment min_silence_duration_ms: Minimum duration of silence to split segments window_size_samples: Window size for VAD processing speech_pad_ms: Padding around speech segments """ if providers is None: providers = [ "CUDAExecutionProvider", "CPUExecutionProvider", ] logger.info("Loading Silero VAD model...") self.vad = onnx_asr.load_vad("silero", providers=providers) # VAD parameters self.threshold = threshold self.min_speech_duration_ms = min_speech_duration_ms self.min_silence_duration_ms = min_silence_duration_ms self.window_size_samples = window_size_samples self.speech_pad_ms = speech_pad_ms logger.info("Silero VAD initialized successfully") def detect_speech( self, audio: np.ndarray, sample_rate: int = 16000, ) -> list: """ Detect speech segments in audio. Args: audio: Audio data as numpy array (float32) sample_rate: Sample rate of audio Returns: List of tuples (start_sample, end_sample) for speech segments """ # Note: The actual VAD processing is typically done within # the onnx_asr model.with_vad() method, but we provide # this interface for direct VAD usage # For direct VAD detection, you would use the vad model directly # However, onnx-asr integrates VAD into the recognition pipeline # So this is mainly for compatibility logger.warning("Direct VAD detection - consider using model.with_vad() instead") return [] def is_speech( self, audio_chunk: np.ndarray, sample_rate: int = 16000, ) -> Tuple[bool, float]: """ Check if audio chunk contains speech. Args: audio_chunk: Audio chunk as numpy array (float32) sample_rate: Sample rate Returns: Tuple of (is_speech: bool, probability: float) """ # Placeholder for direct VAD probability check # In practice, use model.with_vad() for automatic segmentation logger.warning("Direct speech detection not implemented - use model.with_vad()") return False, 0.0 def get_vad(self): """ Get the underlying onnx_asr VAD model. Returns: The onnx_asr VAD model instance """ return self.vad # Convenience function def load_vad(**kwargs): """Load and return Silero VAD with given configuration.""" return SileroVAD(**kwargs)