Phase 4 STT pipeline implemented — Silero VAD + faster-whisper — still not working well at all

2026-01-17 03:14:40 +02:00
parent 3e59e5d2f6
commit d1e6b21508
30 changed files with 156595 additions and 8 deletions
--- a/bot/utils/voice_manager.py
+++ b/bot/utils/voice_manager.py
@@ -19,6 +19,7 @@ import json
 import os
 from typing import Optional
 import discord
+from discord.ext import voice_recv
 import globals
 from utils.logger import get_logger

@@ -97,12 +98,12 @@ class VoiceSessionManager:
                # 10. Create voice session
                self.active_session = VoiceSession(guild_id, voice_channel, text_channel)
                
-                # 11. Connect to Discord voice channel
+                # 11. Connect to Discord voice channel with VoiceRecvClient
                try:
-                    voice_client = await voice_channel.connect()
+                    voice_client = await voice_channel.connect(cls=voice_recv.VoiceRecvClient)
                    self.active_session.voice_client = voice_client
                    self.active_session.active = True
-                    logger.info(f"✓ Connected to voice channel: {voice_channel.name}")
+                    logger.info(f"✓ Connected to voice channel: {voice_channel.name} (with audio receiving)")
                except Exception as e:
                    logger.error(f"Failed to connect to voice channel: {e}", exc_info=True)
                    raise
@@ -387,7 +388,9 @@ class VoiceSession:
        self.voice_client: Optional[discord.VoiceClient] = None
        self.audio_source: Optional['MikuVoiceSource'] = None  # Forward reference
        self.tts_streamer: Optional['TTSTokenStreamer'] = None  # Forward reference
+        self.voice_receiver: Optional['VoiceReceiver'] = None  # STT receiver
        self.active = False
+        self.miku_speaking = False  # Track if Miku is currently speaking
        
        logger.info(f"VoiceSession created for {voice_channel.name} in guild {guild_id}")
    
@@ -433,6 +436,207 @@ class VoiceSession:
            
        except Exception as e:
            logger.error(f"Error stopping audio streaming: {e}", exc_info=True)
+    
+    async def start_listening(self, user: discord.User):
+        """
+        Start listening to a user's voice (STT).
+
+        On the first call a VoiceReceiverSink is created and attached to the
+        voice client; later calls reuse it and only register the extra user.
+
+        Args:
+            user: Discord user to listen to
+
+        Raises:
+            RuntimeError: If the session has no voice client to receive from.
+            Exception: Any error raised while attaching the sink or
+                registering the user is logged and re-raised.
+        """
+        from utils.voice_receiver import VoiceReceiverSink
+
+        try:
+            # Create receiver if not exists
+            if not self.voice_receiver:
+                # Without a voice client the sink would never be attached, and
+                # caching it would stop later calls from ever attaching it.
+                if not self.voice_client:
+                    raise RuntimeError("Cannot start listening: no active voice client")
+
+                # Attach first, publish second: if listen() raises, the next
+                # call retries instead of reusing an unattached sink.
+                receiver = VoiceReceiverSink(self)
+                self.voice_client.listen(receiver)
+                self.voice_receiver = receiver
+                logger.info("✓ Discord voice receive started (discord-ext-voice-recv)")
+
+            # Start listening to specific user
+            await self.voice_receiver.start_listening(user.id, user)
+            logger.info(f"✓ Started listening to {user.name}")
+
+        except Exception as e:
+            logger.error(f"Failed to start listening to {user.name}: {e}", exc_info=True)
+            raise
+    
+    async def stop_listening(self, user_id: int):
+        """
+        Stop listening to a single user.
+
+        Does nothing when no receiver has been created.
+
+        Args:
+            user_id: Discord user ID
+        """
+        receiver = self.voice_receiver
+        if not receiver:
+            return
+
+        await receiver.stop_listening(user_id)
+        logger.info(f"✓ Stopped listening to user {user_id}")
+    
+    async def stop_all_listening(self):
+        """Stop listening to every user and drop the receiver reference."""
+        receiver = self.voice_receiver
+        if not receiver:
+            return
+
+        # NOTE(review): the voice client itself is not told to stop receiving
+        # here; confirm that stop_all() detaches the sink, otherwise a later
+        # listen() call on the same client may be rejected.
+        await receiver.stop_all()
+        self.voice_receiver = None
+        logger.info("✓ Stopped all listening")
+    
+    async def on_user_vad_event(self, user_id: int, event: dict):
+        """
+        Log a VAD speech-state change for a user.
+
+        Args:
+            user_id: Discord user ID the event belongs to
+            event: Event payload; only its 'event' key is read
+        """
+        logger.debug(f"User {user_id} VAD: {event.get('event')}")
+    
+    async def on_partial_transcript(self, user_id: int, text: str):
+        """
+        Log an in-progress transcript for a user.
+
+        Args:
+            user_id: Discord user ID the transcript belongs to
+            text: Partial transcript text
+        """
+        # Could show "User is saying..." in chat
+        logger.info(f"Partial from user {user_id}: {text}")
+    
+    async def on_final_transcript(self, user_id: int, text: str):
+        """
+        Called when final transcript is received.
+        This triggers LLM response and TTS.
+
+        The transcript is echoed to the text channel before the voice
+        response is generated. Blank transcripts and users that cannot be
+        resolved in the guild are ignored.
+
+        Args:
+            user_id: Discord user ID of the speaker
+            text: Final transcript text
+        """
+        logger.info(f"Final from user {user_id}: {text}")
+
+        # An empty transcript would post an empty quote and spend a full
+        # LLM + TTS round trip on nothing.
+        if not text or not text.strip():
+            logger.debug(f"Ignoring empty transcript from user {user_id}")
+            return
+
+        # Get user info
+        user = self.voice_channel.guild.get_member(user_id)
+        if not user:
+            logger.warning(f"User {user_id} not found in guild")
+            return
+
+        # Show what user said
+        await self.text_channel.send(f"🎤 {user.name}: *\"{text}\"*")
+
+        # Generate LLM response and speak it
+        await self._generate_voice_response(user, text)
+    
+    async def on_user_interruption(self, user_id: int, probability: float):
+        """
+        React to a user talking over Miku.
+
+        Only acts while Miku is speaking: cancels the current TTS and posts
+        a notice in the text channel.
+
+        Args:
+            user_id: Discord user ID of the interrupting user
+            probability: Probability value reported with the interruption
+        """
+        if self.miku_speaking:
+            logger.info(f"User {user_id} interrupted Miku (prob={probability:.3f})")
+
+            # Cancel Miku's speech
+            await self._cancel_tts()
+
+            # Fall back to a generic label when the member cannot be resolved
+            member = self.voice_channel.guild.get_member(user_id)
+            display_name = member.name if member else 'User'
+            await self.text_channel.send(f"⚠️ *{display_name} interrupted Miku*")
+    
+    async def _generate_voice_response(self, user: discord.User, text: str):
+        """
+        Generate LLM response and speak it.
+
+        Streams a chat completion from the LLM server and forwards each
+        content token to the audio source as it arrives. Streaming stops
+        early when ``miku_speaking`` is cleared (interruption). Failures are
+        logged and reported in the text channel; they are not re-raised.
+        ``miku_speaking`` is always reset to False on exit.
+
+        Args:
+            user: User who spoke
+            text: Transcribed text
+        """
+        try:
+            self.miku_speaking = True
+
+            # Fail before contacting the LLM: without an audio source there
+            # is nothing to speak the tokens with.
+            if self.audio_source is None:
+                raise RuntimeError("No audio source available for voice response")
+
+            # Show processing
+            await self.text_channel.send("💭 *Miku is thinking...*")
+
+            # Import here to avoid circular imports
+            import json
+            import aiohttp
+            import globals
+            from utils.llm import get_current_gpu_url
+
+            # Simple system prompt for voice
+            system_prompt = """You are Hatsune Miku, the virtual singer. 
+Respond naturally and concisely as Miku would in a voice conversation.
+Keep responses short (1-3 sentences) since they will be spoken aloud."""
+
+            payload = {
+                "model": globals.TEXT_MODEL,
+                "messages": [
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": text}
+                ],
+                "stream": True,
+                "temperature": 0.8,
+                "max_tokens": 200
+            }
+
+            headers = {'Content-Type': 'application/json'}
+            llama_url = get_current_gpu_url()
+
+            # Stream LLM response to TTS
+            response_parts = []
+            async with aiohttp.ClientSession() as http_session:
+                async with http_session.post(
+                    f"{llama_url}/v1/chat/completions",
+                    json=payload,
+                    headers=headers,
+                    timeout=aiohttp.ClientTimeout(total=60)
+                ) as response:
+                    if response.status != 200:
+                        error_text = await response.text()
+                        raise Exception(f"LLM error {response.status}: {error_text}")
+
+                    # Stream tokens to TTS
+                    async for raw_line in response.content:
+                        if not self.miku_speaking:
+                            # Interrupted
+                            break
+
+                        line = raw_line.decode('utf-8').strip()
+                        # The space after "data:" is optional in an event stream
+                        if not line.startswith('data:'):
+                            continue
+                        data_str = line[5:].strip()
+                        if data_str == '[DONE]':
+                            break
+
+                        try:
+                            data = json.loads(data_str)
+                        except json.JSONDecodeError:
+                            # Skip chunks that are not valid JSON
+                            continue
+
+                        choices = data.get('choices') if isinstance(data, dict) else None
+                        if not choices:
+                            continue
+
+                        # Tolerate a null delta or null content
+                        delta = choices[0].get('delta') or {}
+                        content = delta.get('content')
+                        if content:
+                            await self.audio_source.send_token(content)
+                            response_parts.append(content)
+
+            full_response = "".join(response_parts).strip()
+
+            # Flush TTS (skipped after an interruption cleared the flag)
+            if self.miku_speaking:
+                await self.audio_source.flush()
+
+                # Show response
+                await self.text_channel.send(f"🎤 Miku: *\"{full_response}\"*")
+                logger.info(f"✓ Voice response complete: {full_response}")
+
+        except Exception as e:
+            logger.error(f"Voice response failed: {e}", exc_info=True)
+            await self.text_channel.send("❌ Sorry, I had trouble responding")
+
+        finally:
+            self.miku_speaking = False
+    
+    async def _cancel_tts(
+        self,
+        interrupt_url: str = "http://172.25.0.1:8765/interrupt",
+        timeout: float = 5.0,
+    ):
+        """
+        Cancel current TTS synthesis.
+
+        Clears ``miku_speaking``, stops Discord playback if it is running and
+        posts an interrupt request to the TTS service. A failed interrupt
+        request is logged, not raised.
+
+        Args:
+            interrupt_url: Endpoint that receives the interrupt POST
+            timeout: Total time in seconds allowed for the interrupt request
+        """
+        logger.info("Canceling TTS synthesis")
+
+        # Clear the flag first so a response that is still streaming stops
+        # feeding tokens to TTS while the interrupt request is in flight.
+        self.miku_speaking = False
+
+        # Stop Discord playback
+        if self.voice_client and self.voice_client.is_playing():
+            self.voice_client.stop()
+
+        # Send interrupt to RVC
+        try:
+            import aiohttp
+            # Bounded wait: an unreachable service must not stall the interruption
+            request_timeout = aiohttp.ClientTimeout(total=timeout)
+            async with aiohttp.ClientSession(timeout=request_timeout) as session:
+                async with session.post(interrupt_url) as resp:
+                    if resp.status == 200:
+                        logger.info("✓ TTS interrupted")
+                    else:
+                        logger.warning(f"TTS interrupt returned status {resp.status}")
+        except Exception as e:
+            logger.error(f"Failed to interrupt TTS: {e}")


 # Global singleton instance