Decided on Parakeet ONNX Runtime. Works pretty great. Realtime voice chat possible now. UX lacking.

2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions
--- a/stt-parakeet/test_client.py
+++ b/stt-parakeet/test_client.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+Simple WebSocket client to test the ASR server
+Sends a test audio file to the server
+"""
+import asyncio
+import websockets
+import json
+import sys
+import soundfile as sf
+import numpy as np
+
+
+async def test_connection(audio_file="test.wav"):
+    """Stream an audio file to the ASR WebSocket server and print its transcripts.
+
+    Connects to ws://localhost:8766, prints the first message the server
+    sends, then sends the first channel of the file as int16 samples in
+    roughly 0.5 second binary frames, followed by a JSON
+    ``{"type": "final"}`` command. Afterwards every message of type
+    ``transcript`` is printed until a final one arrives or three consecutive
+    2-second receive timeouts elapse.
+
+    Args:
+        audio_file: Path to an audio file readable by ``soundfile``.
+
+    Returns:
+        0 when the exchange ran to the end (even if no final transcript
+        arrived before giving up), 1 if any exception was raised.
+    """
+    uri = "ws://localhost:8766"
+
+    print(f"Connecting to {uri}...")
+
+    try:
+        async with websockets.connect(uri) as websocket:
+            print("Connected!")
+
+            # Receive welcome message
+            message = await websocket.recv()
+            data = json.loads(message)
+            print(f"Server: {data}")
+
+            # Load audio file
+            print(f"\nLoading audio file: {audio_file}")
+            audio, sr = sf.read(audio_file, dtype='float32')
+
+            if audio.ndim > 1:
+                audio = audio[:, 0]  # Convert to mono (keeps the first channel only)
+
+            print(f"Sample rate: {sr} Hz")
+            print(f"Duration: {len(audio)/sr:.2f} seconds")
+
+            # Convert to int16 for sending. Clip first: samples outside
+            # [-1.0, 1.0] would otherwise overflow the int16 cast and come
+            # out as wrapped/garbage values instead of saturating.
+            audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
+
+            # Send audio in 0.5 second chunks; never let the step drop to 0,
+            # which would make range() below raise ValueError.
+            chunk_size = max(1, int(sr * 0.5))
+
+            print("\nSending audio...")
+
+            # Send all audio chunks as binary frames
+            for i in range(0, len(audio_int16), chunk_size):
+                chunk = audio_int16[i:i+chunk_size]
+                await websocket.send(chunk.tobytes())
+                print(f"Sent chunk {i//chunk_size + 1}", end='\r')
+
+            print("\nAll chunks sent. Sending final command...")
+
+            # Send final command
+            await websocket.send(json.dumps({"type": "final"}))
+
+            # Now receive ALL responses
+            print("\nWaiting for transcriptions...\n")
+            timeout_count = 0
+            while timeout_count < 3:  # Wait for 3 timeouts (6 seconds total) before giving up
+                try:
+                    response = await asyncio.wait_for(websocket.recv(), timeout=2.0)
+                    result = json.loads(response)
+                    # Messages of any other type are ignored and do not
+                    # reset the timeout counter.
+                    if result.get('type') == 'transcript':
+                        text = result.get('text', '')
+                        is_final = result.get('is_final', False)
+                        prefix = "→ FINAL:" if is_final else "→ Progressive:"
+                        print(f"{prefix} {text}\n")
+                        timeout_count = 0  # Reset timeout counter when we get a message
+                        if is_final:
+                            break
+                except asyncio.TimeoutError:
+                    timeout_count += 1
+
+            print("\nTest completed!")
+
+    except Exception as e:
+        # Top-level boundary of a CLI test tool: report and signal failure
+        # through the return value instead of a traceback.
+        print(f"Error: {e}")
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    # The first command-line argument, when given, overrides the default file.
+    cli_args = sys.argv[1:]
+    wav_path = cli_args[0] if cli_args else "test.wav"
+    # Propagate the coroutine's 0/1 result as the process exit status.
+    sys.exit(asyncio.run(test_connection(wav_path)))