#!/usr/bin/env python3
"""
Test script for STT WebSocket server.

Sends test audio and receives VAD/transcription events.

Usage:
    python test_stt.py              # stream a generated 440Hz tone
    python test_stt.py audio.wav    # stream a 16-bit PCM WAV file
"""

import asyncio
import json
import os
import sys
import wave

import numpy as np
import websockets

STT_URI = "ws://localhost:8001/ws/stt/test_user"
SAMPLE_RATE = 16000  # Hz; rate at which audio is streamed to the server
CHUNK_SIZE = 320  # samples per WebSocket message: 20ms at 16kHz
CHUNK_SECONDS = 0.02  # pause between chunks (one chunk's worth of audio)
RECV_TIMEOUT = 0.01  # how long to poll for an event after each chunk


def _generate_tone(frequency, duration, sample_rate=SAMPLE_RATE):
    """Return `duration` seconds of a full-scale sine tone as int16 samples."""
    t = np.linspace(0, duration, int(sample_rate * duration), False)
    audio = np.sin(frequency * 2 * np.pi * t)
    return (audio * 32767).astype(np.int16)


def _load_wav_mono_16k(path):
    """Load a 16-bit PCM WAV file as mono int16 samples at 16kHz.

    Raises:
        ValueError: if the file does not use 16-bit samples.
        wave.Error: if the file is not a readable WAV file.
    """
    with wave.open(path, 'rb') as wav:
        sample_rate = wav.getframerate()
        n_channels = wav.getnchannels()
        sample_width = wav.getsampwidth()
        audio_data = wav.readframes(wav.getnframes())

    # The raw frames are reinterpreted as int16 below, so any other sample
    # width would silently decode to noise.
    if sample_width != 2:
        raise ValueError(
            f"only 16-bit PCM WAV is supported (got {8 * sample_width}-bit)"
        )

    audio_np = np.frombuffer(audio_data, dtype=np.int16)

    # Frames are interleaved per channel; average the channels down to mono.
    if n_channels > 1:
        audio_np = audio_np.reshape(-1, n_channels).mean(axis=1).astype(np.int16)

    # Resample to 16kHz if needed
    if sample_rate != SAMPLE_RATE:
        print(f"⚠️ Resampling from {sample_rate}Hz to 16000Hz...")
        import librosa  # optional dependency, only needed for resampling

        audio_float = audio_np.astype(np.float32) / 32768.0
        audio_resampled = librosa.resample(
            audio_float,
            orig_sr=sample_rate,
            target_sr=SAMPLE_RATE,
        )
        # Resampling filters can overshoot [-1, 1]; clip so the int16 cast
        # cannot wrap around.
        audio_np = (np.clip(audio_resampled, -1.0, 1.0) * 32767).astype(np.int16)

    return audio_np


def _print_event(event):
    """Print one decoded server event; unrecognised event types are ignored."""
    kind = event.get('type')
    if kind == 'vad':
        emoji = "🟢" if event['speaking'] else "⚪"
        details = f"prob={event['probability']:.3f}"
        if 'timestamp' in event:
            details += f", t={event['timestamp']:.1f}ms"
        print(f"{emoji} VAD: {event['event']} ({details})")
    elif kind == 'partial':
        print(f"📝 Partial: \"{event['text']}\"")
    elif kind == 'final':
        print(f"✅ Final: \"{event['text']}\"")
    elif kind == 'interruption':
        print(f"⚠️ Interruption detected! (prob={event['probability']:.3f})")


async def _recv_event(websocket, timeout):
    """Return the next JSON event, or None if none arrives within `timeout`."""
    try:
        message = await asyncio.wait_for(websocket.recv(), timeout=timeout)
    except asyncio.TimeoutError:
        return None  # No event yet
    return json.loads(message)


async def _stream_audio(websocket, samples):
    """Send int16 `samples` in 20ms chunks, printing events as they arrive."""
    for start in range(0, len(samples), CHUNK_SIZE):
        chunk = samples[start:start + CHUNK_SIZE]
        await websocket.send(chunk.tobytes())

        # Poll briefly so streaming is never blocked waiting on the server.
        event = await _recv_event(websocket, RECV_TIMEOUT)
        if event is not None:
            _print_event(event)

        # Small delay between chunks
        await asyncio.sleep(CHUNK_SECONDS)


async def _wait_for_final(websocket, max_wait, label):
    """Print events for up to `max_wait` seconds, stopping at the first 'final'.

    Returns True if a final transcript was received, False on timeout.
    """
    loop = asyncio.get_running_loop()
    # Use a wall-clock deadline so that a burst of non-final events cannot
    # shorten the wait.
    deadline = loop.time() + max_wait
    while True:
        remaining = deadline - loop.time()
        if remaining <= 0:
            return False
        event = await _recv_event(websocket, remaining)
        if event is None:
            continue
        if event.get('type') == 'final':
            print(f"\n✅ {label}: \"{event['text']}\"")
            return True
        _print_event(event)


async def test_websocket():
    """Test STT WebSocket with generated audio.

    Streams a 2 second 440Hz tone to the server, prints every event received
    while streaming, then waits up to 1 second for a final transcript.
    """
    print("🔌 Connecting to STT WebSocket...")

    async with websockets.connect(STT_URI) as websocket:
        # Wait for ready message
        ready = json.loads(await websocket.recv())
        print(f"✅ {ready}")

        # Generate test audio: 2 seconds of 440Hz tone (A4 note).
        # NOTE(review): a pure tone is only a stand-in for speech and may not
        # trigger the server's speech detection; pass a WAV file on the
        # command line for an end-to-end check.
        print("\n🎵 Generating test audio (2 seconds, 440Hz tone)...")
        audio_int16 = _generate_tone(frequency=440, duration=2.0)

        # Round up: a trailing partial chunk is sent as well.
        total_chunks = (len(audio_int16) + CHUNK_SIZE - 1) // CHUNK_SIZE
        print(f"📤 Sending {total_chunks} audio chunks (20ms each)...\n")

        await _stream_audio(websocket, audio_int16)
        print("\n✅ Test audio sent successfully!")

        # Wait a bit for final transcription
        print("⏳ Waiting for final transcription...")
        await _wait_for_final(websocket, 1.0, "FINAL TRANSCRIPT")

        print("\n✅ WebSocket test complete!")


async def test_with_sample_audio():
    """Test with actual speech audio file (if available).

    The file path is taken from sys.argv[1]. The audio is converted to mono
    16kHz int16, streamed to the server, and then up to 2 seconds are allowed
    for a final transcript. A missing or unsupported file is reported and the
    function returns without connecting.
    """
    if len(sys.argv) < 2:
        print("❌ No audio file given (usage: python test_stt.py audio.wav)")
        return

    audio_file = sys.argv[1]
    if not os.path.exists(audio_file):
        print(f"❌ Audio file not found: {audio_file}")
        return

    print(f"📂 Loading audio from: {audio_file}")
    try:
        audio_np = _load_wav_mono_16k(audio_file)
    except (wave.Error, ValueError) as exc:
        print(f"❌ Could not load {audio_file}: {exc}")
        return

    print(f"✅ Audio loaded: {len(audio_np)/16000:.2f} seconds")

    # Send to STT
    async with websockets.connect(STT_URI) as websocket:
        ready_msg = await websocket.recv()
        print(f"✅ {json.loads(ready_msg)}")

        await _stream_audio(websocket, audio_np)

        # Wait for final
        await _wait_for_final(websocket, 2.0, "FINAL")


if __name__ == "__main__":
    print("=" * 60)
    print(" Miku STT WebSocket Test")
    print("=" * 60)
    print()

    if len(sys.argv) > 1:
        print("📁 Testing with audio file...")
        asyncio.run(test_with_sample_audio())
    else:
        print("🎵 Testing with generated tone...")
        print(" (To test with audio file: python test_stt.py audio.wav)")
        print()
        asyncio.run(test_websocket())