Decided on Parakeet ONNX Runtime. Works pretty great. Realtime voice chat possible now. UX lacking.
This commit is contained in:
125
stt-parakeet/test_vad_client.py
Normal file
125
stt-parakeet/test_vad_client.py
Normal file
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test client for VAD-enabled server
|
||||
Simulates Discord bot audio streaming with speech detection
|
||||
"""
|
||||
import asyncio
|
||||
import websockets
|
||||
import json
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import sys
|
||||
|
||||
|
||||
def _float_to_int16(audio):
    """Convert float audio samples to 16-bit signed PCM.

    Samples are clipped to [-1.0, 1.0] before scaling so that out-of-range
    values saturate instead of overflowing when cast to int16.
    """
    return (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)


def _print_server_message(result):
    """Pretty-print one decoded server message (a dict) to stdout.

    Handles the 'vad_status', 'transcript', 'info' and 'error' message
    types; any other type is ignored.
    """
    msg_type = result.get('type')

    if msg_type == 'vad_status':
        if result.get('is_speech'):
            print("\n🎤 VAD: Speech detected\n")
        else:
            print("\n🛑 VAD: Speech ended\n")

    elif msg_type == 'transcript':
        text = result.get('text', '')
        # `or 0` also covers an explicit null duration, which would
        # otherwise break the ':.2f' format below.
        duration = result.get('duration') or 0

        if result.get('is_final', False):
            print(f"\n{'='*70}")
            print(f"✅ FINAL TRANSCRIPTION ({duration:.2f}s):")
            print(f" \"{text}\"")
            print(f"{'='*70}\n")
        else:
            print(f"📝 PARTIAL ({duration:.2f}s): {text}")

    elif msg_type == 'info':
        print(f"ℹ️ {result.get('message')}")

    elif msg_type == 'error':
        print(f"❌ Error: {result.get('message')}")


async def test_vad_server(audio_file="test.wav"):
    """Test VAD server with audio file.

    Connects to the WebSocket server at ws://localhost:8766, reads the
    welcome message, streams ``audio_file`` as int16 PCM in 100 ms chunks
    while printing every server message, then sends a ``force_transcribe``
    command and waits briefly for the remaining results.

    Args:
        audio_file: Path of the audio file to stream (read with soundfile).

    Returns:
        0 if the run completed, 1 if any exception was raised (the error is
        printed rather than propagated).
    """
    uri = "ws://localhost:8766"

    print(f"Connecting to {uri}...")

    try:
        async with websockets.connect(uri) as websocket:
            print("✓ Connected!\n")

            # Receive welcome message
            message = await websocket.recv()
            data = json.loads(message)
            print(f"Server says: {data.get('message')}")
            print(f"VAD enabled: {data.get('vad_enabled')}\n")

            # Load audio file
            print(f"Loading audio: {audio_file}")
            audio, sr = sf.read(audio_file, dtype='float32')

            if audio.ndim > 1:
                audio = audio[:, 0]  # Mono: keep the first channel only

            print(f"Duration: {len(audio)/sr:.2f}s")
            print(f"Sample rate: {sr} Hz\n")

            # NOTE(review): audio is sent at the file's native sample rate
            # with no resampling — confirm the server expects this rate.
            audio_int16 = _float_to_int16(audio)

            async def receive_messages():
                """Receive and display server messages until cancelled."""
                try:
                    while True:
                        response = await websocket.recv()
                        try:
                            result = json.loads(response)
                        except (TypeError, ValueError) as exc:
                            # One bad frame must not stop the listener.
                            print(f"⚠️ Ignoring non-JSON message: {exc}")
                            continue
                        if not isinstance(result, dict):
                            print(f"⚠️ Ignoring unexpected message: {result!r}")
                            continue
                        _print_server_message(result)
                except websockets.ConnectionClosed as exc:
                    print(f"\nℹ️ Connection closed: {exc}")
                except Exception as exc:
                    # Report instead of silently dropping the failure;
                    # cancellation is a BaseException and passes through.
                    print(f"\n⚠️ Listener stopped: {exc}")

            # Listen for responses in the background while streaming.
            listen_task = asyncio.create_task(receive_messages())

            try:
                # 100 ms chunks; at least one sample so range() never gets
                # a zero step.
                chunk_size = max(1, int(sr * 0.1))
                print("Streaming audio...\n")

                for start in range(0, len(audio_int16), chunk_size):
                    chunk = audio_int16[start:start + chunk_size]
                    await websocket.send(chunk.tobytes())
                    # Each 100 ms chunk is followed by a 50 ms pause, so the
                    # stream runs at roughly twice real-time speed.
                    await asyncio.sleep(0.05)

                print("\nAll audio sent. Waiting for final transcription...")

                # Wait for processing
                await asyncio.sleep(3.0)

                # Force transcribe any remaining buffer
                print("Sending force_transcribe command...\n")
                await websocket.send(json.dumps({"type": "force_transcribe"}))

                # Wait a bit more
                await asyncio.sleep(2.0)
            finally:
                # Always stop the listener, even when streaming failed.
                listen_task.cancel()
                try:
                    await listen_task
                except asyncio.CancelledError:
                    pass

            print("\n✓ Test completed!")

    except Exception as e:
        print(f"❌ Error: {e}")
        return 1

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # An optional first command-line argument overrides the default
    # input file; the coroutine's return value becomes the exit status.
    cli_args = sys.argv[1:]
    wav_path = cli_args[0] if cli_args else "test.wav"
    sys.exit(asyncio.run(test_vad_server(wav_path)))
|
||||
Reference in New Issue
Block a user