#!/usr/bin/env python3
"""
Test script for STT WebSocket server.
Sends test audio and receives VAD/transcription events.
"""

import asyncio
import websockets
import numpy as np
import json
import wave


async def test_websocket():
    """Stream a generated tone to the STT WebSocket and print returned events.

    Connects to the local STT endpoint, reads the server's first message,
    then sends 2 seconds of a 440 Hz sine tone as int16 samples in 20 ms
    chunks. While each chunk's 20 ms window elapses, every event the server
    sends is read and printed. Afterwards the function keeps listening for
    up to one second (wall-clock) for a ``final`` transcription event.

    Connection errors raised by ``websockets`` are not caught here.
    """

    uri = "ws://localhost:8001/ws/stt/test_user"
    sample_rate = 16000
    duration = 2.0
    frequency = 440  # A4 note
    chunk_size = 320  # 20 ms of audio at 16 kHz
    chunk_seconds = chunk_size / sample_rate
    final_wait_seconds = 1.0

    def report(event, awaiting_final):
        """Print one decoded event; return True if it is a final transcript."""
        kind = event.get('type')
        try:
            if kind == 'vad':
                emoji = "🟢" if event['speaking'] else "⚪"
                if awaiting_final:
                    print(f"{emoji} VAD: {event['event']} "
                          f"(prob={event['probability']:.3f})")
                else:
                    print(f"{emoji} VAD: {event['event']} "
                          f"(prob={event['probability']:.3f}, "
                          f"t={event['timestamp']:.1f}ms)")
            elif kind == 'final':
                if awaiting_final:
                    print(f"\n✅ FINAL TRANSCRIPT: \"{event['text']}\"")
                else:
                    print(f"✅ Final: \"{event['text']}\"")
                return True
            elif awaiting_final:
                # After streaming, only VAD and final events are reported.
                pass
            elif kind == 'partial':
                print(f"📝 Partial: \"{event['text']}\"")
            elif kind == 'interruption':
                print(f"⚠️  Interruption detected! (prob={event['probability']:.3f})")
        except (KeyError, TypeError, ValueError):
            # A known event type with missing or mistyped fields: show it
            # raw instead of aborting the whole test run.
            print(f"⚠️  Malformed event: {event}")
        return False

    async def listen(websocket, seconds, awaiting_final=False):
        """Read and print events for ``seconds``; return True on a final.

        The window is measured against the event-loop clock, so the number
        of messages received does not shorten or lengthen the wait. With
        ``awaiting_final`` set, the first final transcript ends it early.
        """
        loop = asyncio.get_running_loop()
        deadline = loop.time() + seconds
        while True:
            remaining = deadline - loop.time()
            if remaining <= 0:
                return False
            try:
                response = await asyncio.wait_for(
                    websocket.recv(),
                    timeout=remaining
                )
            except asyncio.TimeoutError:
                return False
            try:
                event = json.loads(response)
            except (TypeError, ValueError):
                continue  # Not JSON; nothing to report
            if not isinstance(event, dict):
                continue
            if report(event, awaiting_final) and awaiting_final:
                return True

    print("🔌 Connecting to STT WebSocket...")

    async with websockets.connect(uri) as websocket:
        # Wait for ready message
        ready = json.loads(await websocket.recv())
        print(f"✅ {ready}")

        # Generate test audio: 2 seconds of 440Hz sine tone
        print("\n🎵 Generating test audio (2 seconds, 440Hz tone)...")
        t = np.linspace(0, duration, int(sample_rate * duration), False)
        audio = np.sin(frequency * 2 * np.pi * t)

        # Convert to int16
        audio_int16 = (audio * 32767).astype(np.int16)

        # One offset per chunk actually sent (includes a trailing short
        # chunk if the sample count is not a multiple of chunk_size).
        offsets = range(0, len(audio_int16), chunk_size)
        print(f"📤 Sending {len(offsets)} audio chunks (20ms each)...\n")

        for start in offsets:
            chunk = audio_int16[start:start + chunk_size]
            await websocket.send(chunk.tobytes())
            # Pace at the chunk's own duration while draining every event
            # that arrives, so responses do not queue up behind the stream.
            await listen(websocket, chunk_seconds)

        print("\n✅ Test audio sent successfully!")

        # Wait a bit for final transcription
        print("⏳ Waiting for final transcription...")
        await listen(websocket, final_wait_seconds, awaiting_final=True)

        print("\n✅ WebSocket test complete!")


async def test_with_sample_audio():
    """Stream a WAV file (path taken from ``sys.argv[1]``) to the STT WebSocket.

    The file must contain 16-bit PCM samples. Multi-channel audio is
    averaged down to one channel, and audio at any rate other than 16 kHz
    is resampled with ``librosa`` (imported only when needed). The samples
    are then sent in 20 ms chunks while events from the server are printed;
    afterwards the function listens for up to two seconds (wall-clock) for
    a ``final`` transcription event.

    If no path is given, the path is not a file, or the file is not
    readable 16-bit PCM WAV, an error line is printed and the function
    returns without connecting. Connection errors raised by ``websockets``
    are not caught here.
    """

    import sys
    import os

    target_rate = 16000
    chunk_size = 320  # 20ms at 16kHz
    chunk_seconds = chunk_size / target_rate
    final_wait_seconds = 2.0

    if len(sys.argv) < 2:
        print("❌ No audio file given (usage: python test_stt.py audio.wav)")
        return

    audio_file = sys.argv[1]
    if not os.path.isfile(audio_file):
        print(f"❌ Audio file not found: {audio_file}")
        return

    print(f"📂 Loading audio from: {audio_file}")

    # Load WAV file
    try:
        with wave.open(audio_file, 'rb') as wav:
            sample_rate = wav.getframerate()
            n_channels = wav.getnchannels()
            sample_width = wav.getsampwidth()
            audio_data = wav.readframes(wav.getnframes())
    except (wave.Error, EOFError) as exc:
        print(f"❌ Could not read WAV file: {exc}")
        return

    # The samples are decoded as int16 below; any other width would be
    # silently misinterpreted, so reject it up front.
    if sample_width != 2:
        print(f"❌ Unsupported sample width: {sample_width * 8}-bit "
              f"(expected 16-bit PCM)")
        return

    # WAV PCM data is little-endian regardless of the host byte order.
    audio_np = np.frombuffer(audio_data, dtype="<i2")

    # Downmix any multi-channel layout to mono by averaging the channels.
    if n_channels > 1:
        usable = len(audio_np) - len(audio_np) % n_channels  # drop a torn frame
        audio_np = (
            audio_np[:usable]
            .reshape(-1, n_channels)
            .mean(axis=1)
            .astype(np.int16)
        )

    # Resample to 16kHz if needed
    if sample_rate != target_rate:
        print(f"⚠️  Resampling from {sample_rate}Hz to 16000Hz...")
        import librosa
        audio_float = audio_np.astype(np.float32) / 32768.0
        audio_resampled = librosa.resample(
            audio_float,
            orig_sr=sample_rate,
            target_sr=target_rate
        )
        # Resampling filters can overshoot [-1, 1]; clip so the int16 cast
        # saturates instead of wrapping around.
        audio_np = (np.clip(audio_resampled, -1.0, 1.0) * 32767).astype(np.int16)

    print(f"✅ Audio loaded: {len(audio_np)/16000:.2f} seconds")

    def report(event, awaiting_final):
        """Print one decoded event; return True if it is a final transcript."""
        kind = event.get('type')
        try:
            if awaiting_final:
                # After streaming, only the final transcript is reported.
                if kind == 'final':
                    print(f"\n✅ FINAL: \"{event['text']}\"")
                    return True
            elif kind == 'vad':
                emoji = "🟢" if event['speaking'] else "⚪"
                print(f"{emoji} VAD: {event['event']} (prob={event['probability']:.3f})")
            elif kind in ('partial', 'final'):
                print(f"📝 {kind.title()}: \"{event['text']}\"")
        except (KeyError, TypeError, ValueError):
            # A known event type with missing or mistyped fields: show it
            # raw instead of aborting the whole test run.
            print(f"⚠️  Malformed event: {event}")
        return False

    async def listen(websocket, seconds, awaiting_final=False):
        """Read and print events for ``seconds``; return True on a final.

        The window is measured against the event-loop clock, so the number
        of messages received does not shorten or lengthen the wait. With
        ``awaiting_final`` set, the first final transcript ends it early.
        """
        loop = asyncio.get_running_loop()
        deadline = loop.time() + seconds
        while True:
            remaining = deadline - loop.time()
            if remaining <= 0:
                return False
            try:
                response = await asyncio.wait_for(websocket.recv(), timeout=remaining)
            except asyncio.TimeoutError:
                return False
            try:
                event = json.loads(response)
            except (TypeError, ValueError):
                continue  # Not JSON; nothing to report
            if not isinstance(event, dict):
                continue
            if report(event, awaiting_final) and awaiting_final:
                return True

    # Send to STT
    uri = "ws://localhost:8001/ws/stt/test_user"

    async with websockets.connect(uri) as websocket:
        ready_msg = await websocket.recv()
        print(f"✅ {json.loads(ready_msg)}")

        # Send in chunks, draining events during each chunk's 20 ms window
        for start in range(0, len(audio_np), chunk_size):
            chunk = audio_np[start:start + chunk_size]
            await websocket.send(chunk.tobytes())
            await listen(websocket, chunk_seconds)

        # Wait for final
        await listen(websocket, final_wait_seconds, awaiting_final=True)


if __name__ == "__main__":
    import sys

    # Banner: title framed by two 60-character rules, then a blank line.
    rule = "=" * 60
    for banner_line in (rule, "  Miku STT WebSocket Test", rule):
        print(banner_line)
    print()

    # Any command-line argument selects the file-based test; with none,
    # fall back to the generated-tone test.
    audio_path_given = len(sys.argv) > 1
    if not audio_path_given:
        print("🎵 Testing with generated tone...")
        print("   (To test with audio file: python test_stt.py audio.wav)")
        print()
        asyncio.run(test_websocket())
    else:
        print("📁 Testing with audio file...")
        asyncio.run(test_with_sample_audio())