Decided on Parakeet ONNX Runtime. Works pretty great. Realtime voice chat possible now. UX lacking.

2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions
--- a/stt-parakeet/test_vad_client.py
+++ b/stt-parakeet/test_vad_client.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+Test client for VAD-enabled server
+Simulates Discord bot audio streaming with speech detection
+"""
+import asyncio
+import websockets
+import json
+import numpy as np
+import soundfile as sf
+import sys
+
+
+async def test_vad_server(audio_file="test.wav"):
+    """Test VAD server with audio file."""
+    uri = "ws://localhost:8766"
+    
+    print(f"Connecting to {uri}...")
+    
+    try:
+        async with websockets.connect(uri) as websocket:
+            print("✓ Connected!\n")
+            
+            # Receive welcome message
+            message = await websocket.recv()
+            data = json.loads(message)
+            print(f"Server says: {data.get('message')}")
+            print(f"VAD enabled: {data.get('vad_enabled')}\n")
+            
+            # Load audio file
+            print(f"Loading audio: {audio_file}")
+            audio, sr = sf.read(audio_file, dtype='float32')
+            
+            if audio.ndim > 1:
+                audio = audio[:, 0]  # Mono
+            
+            print(f"Duration: {len(audio)/sr:.2f}s")
+            print(f"Sample rate: {sr} Hz\n")
+            
+            # Convert to int16
+            audio_int16 = (audio * 32767).astype(np.int16)
+            
+            # Listen for responses in background
+            async def receive_messages():
+                """Receive and display server messages."""
+                try:
+                    while True:
+                        response = await websocket.recv()
+                        result = json.loads(response)
+                        
+                        msg_type = result.get('type')
+                        
+                        if msg_type == 'vad_status':
+                            is_speech = result.get('is_speech')
+                            if is_speech:
+                                print("\n🎤 VAD: Speech detected\n")
+                            else:
+                                print("\n🛑 VAD: Speech ended\n")
+                        
+                        elif msg_type == 'transcript':
+                            text = result.get('text', '')
+                            duration = result.get('duration', 0)
+                            is_final = result.get('is_final', False)
+                            
+                            if is_final:
+                                print(f"\n{'='*70}")
+                                print(f"✅ FINAL TRANSCRIPTION ({duration:.2f}s):")
+                                print(f"   \"{text}\"")
+                                print(f"{'='*70}\n")
+                            else:
+                                print(f"📝 PARTIAL ({duration:.2f}s): {text}")
+                        
+                        elif msg_type == 'info':
+                            print(f"ℹ️  {result.get('message')}")
+                        
+                        elif msg_type == 'error':
+                            print(f"❌ Error: {result.get('message')}")
+                
+                except Exception as e:
+                    pass
+            
+            # Start listener
+            listen_task = asyncio.create_task(receive_messages())
+            
+            # Send audio in small chunks (simulate streaming)
+            chunk_size = int(sr * 0.1)  # 100ms chunks
+            print("Streaming audio...\n")
+            
+            for i in range(0, len(audio_int16), chunk_size):
+                chunk = audio_int16[i:i+chunk_size]
+                await websocket.send(chunk.tobytes())
+                await asyncio.sleep(0.05)  # Simulate real-time
+            
+            print("\nAll audio sent. Waiting for final transcription...")
+            
+            # Wait for processing
+            await asyncio.sleep(3.0)
+            
+            # Force transcribe any remaining buffer
+            print("Sending force_transcribe command...\n")
+            await websocket.send(json.dumps({"type": "force_transcribe"}))
+            
+            # Wait a bit more
+            await asyncio.sleep(2.0)
+            
+            # Cancel listener
+            listen_task.cancel()
+            try:
+                await listen_task
+            except asyncio.CancelledError:
+                pass
+            
+            print("\n✓ Test completed!")
+    
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        return 1
+    
+    return 0
+
+
+if __name__ == "__main__":
+    audio_file = sys.argv[1] if len(sys.argv) > 1 else "test.wav"
+    exit_code = asyncio.run(test_vad_server(audio_file))
+    sys.exit(exit_code)