# miku-discord/bot/utils/voice_manager.py

# voice_manager.py
"""
Voice session manager for Miku Discord bot.
Handles Discord voice channel connections, resource locking, and feature blocking during voice sessions.

During a voice session:
- GPU switches to AMD for text inference only
- Vision model is blocked (keeps GTX 1660 for TTS)
- Image generation is blocked
- Bipolar mode interactions are disabled
- Profile picture switching is locked
- Autonomous engine is paused
- Scheduled events are paused
- Text channels are paused (messages are queued, but the queue is currently discarded on resume)
- Figurine notifier is paused
"""

import asyncio
import json
import os
from typing import Optional
import discord
from discord.ext import voice_recv
import globals
from utils.logger import get_logger

# Module-level logger shared by VoiceSessionManager and VoiceSession.
logger = get_logger('voice_manager')


class VoiceSessionManager:
    """
    Singleton manager for voice chat sessions.

    Ensures only one voice session is active at a time and owns every
    resource lock taken for the duration of a session (vision model, image
    generation, text channels, bipolar mode, profile picture switching,
    autonomous engine, scheduler, figurine notifier).
    """

    _instance = None

    def __new__(cls):
        """Return the single shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Initialise manager state once; repeated construction is a no-op."""
        if self._initialized:
            return

        self.active_session: Optional['VoiceSession'] = None
        # Serialises start_session/end_session so they never interleave.
        self.session_lock = asyncio.Lock()
        self._initialized = True
        logger.info("VoiceSessionManager initialized")

    async def start_session(self, guild_id: int, voice_channel: discord.VoiceChannel, text_channel: discord.TextChannel):
        """
        Start a voice session with full resource locking.

        Args:
            guild_id: Discord guild ID
            voice_channel: Voice channel to join
            text_channel: Text channel for voice prompts

        Raises:
            Exception: If session already active or resources can't be locked.
                On any failure after locking began, all locks are released
                again before the exception propagates.
        """
        async with self.session_lock:
            if self.active_session:
                raise Exception("Voice session already active")

            logger.info(f"Starting voice session in {voice_channel.name} (guild {guild_id})")

            try:
                # 1. Switch to AMD GPU for text inference
                await self._switch_to_amd_gpu()

                # 2. Block vision model loading
                await self._block_vision_model()

                # 3. Disable image generation (ComfyUI)
                await self._disable_image_generation()

                # 4. Pause text channel inference (queue messages)
                await self._pause_text_channels()

                # 5. Disable bipolar mode interactions (Miku/Evil Miku arguments)
                await self._disable_bipolar_mode()

                # 6. Disable profile picture switching
                await self._disable_profile_picture_switching()

                # 7. Pause autonomous engine
                await self._pause_autonomous_engine()

                # 8. Pause scheduled events
                await self._pause_scheduled_events()

                # 9. Pause figurine notifier
                await self._pause_figurine_notifier()

                # 10. Create voice session
                self.active_session = VoiceSession(guild_id, voice_channel, text_channel)

                # 11. Connect to Discord voice channel with VoiceRecvClient
                try:
                    voice_client = await voice_channel.connect(cls=voice_recv.VoiceRecvClient)
                    self.active_session.voice_client = voice_client
                    self.active_session.active = True
                    logger.info(f"✓ Connected to voice channel: {voice_channel.name} (with audio receiving)")
                except Exception as e:
                    logger.error(f"Failed to connect to voice channel: {e}", exc_info=True)
                    raise

                # 12. Start audio streaming (Phase 2)
                try:
                    await self.active_session.start_audio_streaming()
                    logger.info("✓ Audio streaming started")
                except Exception as e:
                    logger.error(f"Failed to start audio streaming: {e}", exc_info=True)
                    # Continue anyway - audio streaming is optional for Phase 2 testing

                logger.info("✓ Voice session started successfully")

            except Exception as e:
                logger.error(f"Failed to start voice session: {e}", exc_info=True)
                # Cleanup on failure
                await self._cleanup_failed_start()
                raise

    async def end_session(self):
        """
        End voice session and release all resources.

        Every teardown and release step runs even if an earlier one fails, so
        a single faulty subsystem cannot leave the other features blocked.

        Raises:
            Exception: The first error raised by a resource-release step, after
                all remaining steps have run and the session has been cleared.
        """
        async with self.session_lock:
            session = self.active_session
            if not session:
                logger.warning("No active voice session to end")
                return

            logger.info("Ending voice session...")

            try:
                # 1. Stop speech-to-text receivers (no-op when none were started)
                try:
                    await session.stop_all_listening()
                except Exception as e:
                    logger.error(f"Error stopping voice receivers: {e}")

                # 2. Stop audio streaming
                try:
                    await session.stop_audio_streaming()
                except Exception as e:
                    logger.error(f"Error stopping audio streaming: {e}")

                # 3. Disconnect from voice channel
                if session.voice_client:
                    try:
                        await session.voice_client.disconnect()
                        logger.info("✓ Disconnected from voice channel")
                    except Exception as e:
                        logger.error(f"Error disconnecting from voice: {e}")

                # 4. Release every feature lock taken by start_session
                release_error = await self._release_resource_locks()
            finally:
                # 5. Always clear the session, even on error or cancellation,
                #    so a new session can be started afterwards.
                session.active = False
                self.active_session = None

            if release_error is not None:
                logger.error(f"Error during session cleanup: {release_error}")
                raise release_error

            logger.info("✓ Voice session ended successfully, all resources released")

    # ==================== Resource Locking Methods ====================

    async def _release_resource_locks(self) -> Optional[Exception]:
        """
        Run every release/resume step, isolating failures from each other.

        Returns:
            The first exception raised by a step, or None if all succeeded.
        """
        release_steps = (
            self._resume_text_channels,
            self._unblock_vision_model,
            self._enable_image_generation,
            self._enable_bipolar_mode,
            self._enable_profile_picture_switching,
            self._resume_autonomous_engine,
            self._resume_scheduled_events,
            self._resume_figurine_notifier,
        )

        first_error: Optional[Exception] = None
        for step in release_steps:
            try:
                await step()
            except Exception as e:
                logger.error(f"Release step {step.__name__} failed: {e}", exc_info=True)
                if first_error is None:
                    first_error = e
        return first_error

    async def _switch_to_amd_gpu(self):
        """
        Switch text inference to AMD GPU (RX 6800).

        Records the choice in memory/gpu_state.json.

        Raises:
            Exception: If the state file cannot be written.
        """
        # NOTE(review): nothing in this class writes the previous GPU state
        # back when the session ends - confirm that is handled elsewhere.
        try:
            gpu_state_file = os.path.join("memory", "gpu_state.json")
            os.makedirs("memory", exist_ok=True)

            with open(gpu_state_file, "w") as f:
                json.dump({"current_gpu": "amd", "reason": "voice_session"}, f)

            logger.info("✓ Switched to AMD GPU for text inference")
        except Exception as e:
            logger.error(f"Failed to switch GPU: {e}")
            raise

    async def _block_vision_model(self):
        """Prevent vision model from loading during voice session"""
        globals.VISION_MODEL_BLOCKED = True
        logger.info("✓ Vision model blocked")

    async def _unblock_vision_model(self):
        """Allow vision model to load after voice session"""
        globals.VISION_MODEL_BLOCKED = False
        logger.info("✓ Vision model unblocked")

    async def _disable_image_generation(self):
        """Block ComfyUI image generation during voice session"""
        globals.IMAGE_GENERATION_BLOCKED = True
        globals.IMAGE_GENERATION_BLOCK_MESSAGE = (
            "🎤 I can't draw right now, I'm talking in voice chat! "
            "Ask me again after I leave the voice channel."
        )
        logger.info("✓ Image generation disabled")

    async def _enable_image_generation(self):
        """Re-enable image generation after voice session"""
        globals.IMAGE_GENERATION_BLOCKED = False
        globals.IMAGE_GENERATION_BLOCK_MESSAGE = None
        logger.info("✓ Image generation re-enabled")

    async def _pause_text_channels(self):
        """Queue text messages instead of processing during voice session"""
        globals.VOICE_SESSION_ACTIVE = True
        globals.TEXT_MESSAGE_QUEUE = []
        logger.info("✓ Text channels paused (messages will be queued)")

    async def _resume_text_channels(self):
        """
        Resume text channels after voice session.

        Queued messages are currently discarded (queue processing is not
        implemented yet); the number dropped is logged as a warning.
        """
        globals.VOICE_SESSION_ACTIVE = False
        # The queue may be missing or None when this runs as part of cleanup
        # for a start that failed before _pause_text_channels() was reached.
        queued_count = len(getattr(globals, "TEXT_MESSAGE_QUEUE", None) or [])

        if queued_count > 0:
            logger.info(f"Resuming text channels, {queued_count} messages queued")
            # TODO: Process queue in Phase 2 (need message handler integration)
            # For now, just clear the queue
            globals.TEXT_MESSAGE_QUEUE = []
            logger.warning(f"Discarded {queued_count} queued messages (queue processing not yet implemented)")
        else:
            logger.info("✓ Text channels resumed (no queued messages)")

    async def _disable_bipolar_mode(self):
        """Prevent Miku/Evil Miku arguments during voice session"""
        try:
            from utils.bipolar_mode import pause_bipolar_interactions
            pause_bipolar_interactions()
            logger.info("✓ Bipolar mode interactions disabled")
        except ImportError:
            logger.warning("bipolar_mode module not found, skipping")
        except AttributeError:
            logger.warning("pause_bipolar_interactions not implemented yet, skipping")

    async def _enable_bipolar_mode(self):
        """Re-enable Miku/Evil Miku arguments after voice session"""
        try:
            from utils.bipolar_mode import resume_bipolar_interactions
            resume_bipolar_interactions()
            logger.info("✓ Bipolar mode interactions re-enabled")
        except ImportError:
            logger.warning("bipolar_mode module not found, skipping")
        except AttributeError:
            logger.warning("resume_bipolar_interactions not implemented yet, skipping")

    async def _disable_profile_picture_switching(self):
        """Lock profile picture during voice session"""
        try:
            from utils.profile_picture_manager import profile_picture_manager
            if hasattr(profile_picture_manager, 'lock_switching'):
                profile_picture_manager.lock_switching()
                logger.info("✓ Profile picture switching disabled")
            else:
                logger.warning("profile_picture_manager.lock_switching not implemented yet, skipping")
        except ImportError:
            logger.warning("profile_picture_manager module not found, skipping")

    async def _enable_profile_picture_switching(self):
        """Unlock profile picture after voice session"""
        try:
            from utils.profile_picture_manager import profile_picture_manager
            if hasattr(profile_picture_manager, 'unlock_switching'):
                profile_picture_manager.unlock_switching()
                logger.info("✓ Profile picture switching re-enabled")
            else:
                logger.warning("profile_picture_manager.unlock_switching not implemented yet, skipping")
        except ImportError:
            logger.warning("profile_picture_manager module not found, skipping")

    async def _pause_autonomous_engine(self):
        """Pause autonomous message generation during voice session"""
        try:
            from utils.autonomous import pause_autonomous_system
            pause_autonomous_system()
            logger.info("✓ Autonomous engine paused")
        except ImportError:
            logger.warning("autonomous module not found, skipping")
        except AttributeError:
            logger.warning("pause_autonomous_system not implemented yet, skipping")

    async def _resume_autonomous_engine(self):
        """Resume autonomous message generation after voice session"""
        try:
            from utils.autonomous import resume_autonomous_system
            resume_autonomous_system()
            logger.info("✓ Autonomous engine resumed")
        except ImportError:
            logger.warning("autonomous module not found, skipping")
        except AttributeError:
            logger.warning("resume_autonomous_system not implemented yet, skipping")

    async def _pause_scheduled_events(self):
        """Pause all scheduled jobs during voice session (best effort)"""
        try:
            globals.scheduler.pause()
            logger.info("✓ Scheduled events paused")
        except Exception as e:
            logger.error(f"Failed to pause scheduler: {e}")

    async def _resume_scheduled_events(self):
        """Resume scheduled jobs after voice session (best effort)"""
        try:
            globals.scheduler.resume()
            logger.info("✓ Scheduled events resumed")
        except Exception as e:
            logger.error(f"Failed to resume scheduler: {e}")

    async def _pause_figurine_notifier(self):
        """Pause figurine notifications during voice session"""
        try:
            # Assuming figurine notifier is a scheduled job
            globals.scheduler.pause_job('figurine_notifier')
            logger.info("✓ Figurine notifier paused")
        except Exception as e:
            # Job might not exist, that's okay
            logger.debug(f"Could not pause figurine notifier (may not exist): {e}")

    async def _resume_figurine_notifier(self):
        """Resume figurine notifications after voice session"""
        try:
            globals.scheduler.resume_job('figurine_notifier')
            logger.info("✓ Figurine notifier resumed")
        except Exception as e:
            # Job might not exist, that's okay
            logger.debug(f"Could not resume figurine notifier (may not exist): {e}")

    async def _cleanup_failed_start(self):
        """
        Cleanup resources if session start fails.

        Best effort: never raises. Each release step runs independently, so
        one failing step cannot leave the remaining features locked.
        """
        logger.warning("Cleaning up after failed session start...")

        # Disconnect from voice if connected
        session = self.active_session
        if session and session.voice_client:
            try:
                await session.voice_client.disconnect()
            except Exception as e:
                logger.error(f"Error disconnecting from voice during cleanup: {e}")

        release_error = await self._release_resource_locks()

        # Clear the session
        self.active_session = None

        if release_error is not None:
            logger.error(f"Error during cleanup: {release_error}")


class VoiceSession:
    """
    Represents an active voice chat session with audio streaming.

    Holds the Discord voice client, the TTS audio source, and the optional
    speech-to-text receiver, and drives the transcript -> LLM -> TTS loop.
    """

    # Endpoint that tells the TTS/RVC service to abort in-flight synthesis.
    TTS_INTERRUPT_URL = "http://172.25.0.1:8765/interrupt"
    # Upper bound for the interrupt request so an unreachable service
    # cannot stall interruption handling.
    TTS_INTERRUPT_TIMEOUT_SECONDS = 5

    def __init__(self, guild_id: int, voice_channel: discord.VoiceChannel, text_channel: discord.TextChannel):
        """
        Create an inactive session; the manager attaches the voice client.

        Args:
            guild_id: Discord guild ID
            voice_channel: Voice channel the session belongs to
            text_channel: Text channel used for transcripts and status messages
        """
        self.guild_id = guild_id
        self.voice_channel = voice_channel
        self.text_channel = text_channel
        self.voice_client: Optional[discord.VoiceClient] = None
        self.audio_source: Optional['MikuVoiceSource'] = None  # Forward reference
        self.tts_streamer: Optional['TTSTokenStreamer'] = None  # Forward reference
        self.voice_receiver: Optional['VoiceReceiverSink'] = None  # STT receiver
        self.active = False
        self.miku_speaking = False  # Track if Miku is currently speaking

        logger.info(f"VoiceSession created for {voice_channel.name} in guild {guild_id}")

    async def start_audio_streaming(self):
        """
        Start audio streaming from TTS WebSocket to Discord voice.
        This should be called after voice_client is connected.

        Raises:
            Exception: Whatever the audio source or voice client raised.
        """
        from utils.voice_audio import MikuVoiceSource

        try:
            # Create and connect audio source (handles both sending tokens and receiving audio)
            self.audio_source = MikuVoiceSource()
            await self.audio_source.connect()

            # The audio_source now serves as both the audio source AND the token sender
            # Set tts_streamer to point to audio_source for backwards compatibility
            self.tts_streamer = self.audio_source

            # Start playing audio to Discord
            if self.voice_client and not self.voice_client.is_playing():
                self.voice_client.play(self.audio_source)
                logger.info("✓ Started audio streaming to Discord")

        except Exception as e:
            logger.error(f"Failed to start audio streaming: {e}", exc_info=True)
            raise

    async def stop_audio_streaming(self):
        """Stop audio streaming and cleanup resources. Errors are logged, not raised."""
        try:
            # Stop Discord audio playback
            if self.voice_client and self.voice_client.is_playing():
                self.voice_client.stop()

            # Disconnect audio source (which also handles token streaming)
            if self.audio_source:
                source = self.audio_source
                # Drop the references first so a failing disconnect() cannot
                # leave a stale source attached to the session.
                self.audio_source = None
                self.tts_streamer = None  # Clear reference since it pointed to audio_source
                await source.disconnect()

            logger.info("✓ Stopped audio streaming")

        except Exception as e:
            logger.error(f"Error stopping audio streaming: {e}", exc_info=True)

    async def start_listening(self, user: discord.User):
        """
        Start listening to a user's voice (STT).

        Args:
            user: Discord user to listen to

        Raises:
            Exception: Whatever the receiver or voice client raised.
        """
        from utils.voice_receiver import VoiceReceiverSink

        try:
            # Create receiver if not exists
            if not self.voice_receiver:
                self.voice_receiver = VoiceReceiverSink(self)

                # Start receiving audio from Discord using discord-ext-voice-recv
                if self.voice_client:
                    self.voice_client.listen(self.voice_receiver)
                    logger.info("✓ Discord voice receive started (discord-ext-voice-recv)")

            # Start listening to specific user
            await self.voice_receiver.start_listening(user.id, user)
            logger.info(f"✓ Started listening to {user.name}")

        except Exception as e:
            logger.error(f"Failed to start listening to {user.name}: {e}", exc_info=True)
            raise

    async def stop_listening(self, user_id: int):
        """
        Stop listening to a user.

        Args:
            user_id: Discord user ID
        """
        if self.voice_receiver:
            await self.voice_receiver.stop_listening(user_id)
            logger.info(f"✓ Stopped listening to user {user_id}")

    async def stop_all_listening(self):
        """Stop listening to all users and drop the receiver."""
        if self.voice_receiver:
            await self.voice_receiver.stop_all()
            self.voice_receiver = None
            logger.info("✓ Stopped all listening")

    async def on_user_vad_event(self, user_id: int, event: dict):
        """Called when VAD detects speech state change."""
        event_type = event.get('event')
        logger.debug(f"User {user_id} VAD: {event_type}")

    async def on_partial_transcript(self, user_id: int, text: str):
        """Called when partial transcript is received."""
        logger.info(f"Partial from user {user_id}: {text}")
        # Could show "User is saying..." in chat

    async def on_final_transcript(self, user_id: int, text: str):
        """
        Called when final transcript is received.
        This triggers LLM response and TTS.
        """
        logger.info(f"Final from user {user_id}: {text}")

        # Get user info
        user = self.voice_channel.guild.get_member(user_id)
        if not user:
            logger.warning(f"User {user_id} not found in guild")
            return

        # Show what user said
        await self.text_channel.send(f"🎤 {user.name}: *\"{text}\"*")

        # Generate LLM response and speak it
        await self._generate_voice_response(user, text)

    async def on_user_interruption(self, user_id: int, probability: float):
        """
        Called when user interrupts Miku's speech.
        Cancel TTS and switch to listening.
        """
        if not self.miku_speaking:
            return

        logger.info(f"User {user_id} interrupted Miku (prob={probability:.3f})")

        # Cancel Miku's speech
        await self._cancel_tts()

        # Show interruption in chat
        user = self.voice_channel.guild.get_member(user_id)
        await self.text_channel.send(f"⚠️ *{user.name if user else 'User'} interrupted Miku*")

    @staticmethod
    def _parse_stream_line(raw_line: bytes):
        """
        Decode one line of the streamed chat-completions response.

        Args:
            raw_line: One raw line read from the HTTP response body

        Returns:
            A ``(done, content)`` tuple: ``done`` is True for the ``[DONE]``
            marker; ``content`` is the text delta, or ``''`` when the line
            carries none (non-data line, malformed JSON, empty delta).
        """
        line = raw_line.decode('utf-8').strip()
        if not line.startswith('data: '):
            return False, ''

        data_str = line[6:]
        if data_str == '[DONE]':
            return True, ''

        try:
            data = json.loads(data_str)
        except json.JSONDecodeError:
            return False, ''

        choices = data.get('choices') if isinstance(data, dict) else None
        if not choices:
            return False, ''

        delta = choices[0].get('delta') or {}
        return False, delta.get('content') or ''

    async def _generate_voice_response(self, user: discord.User, text: str):
        """
        Generate LLM response and speak it.

        Streams the completion token by token into the TTS audio source. If
        no audio source is connected (audio streaming failed to start), the
        response is still generated and posted to the text channel.
        Failures are logged and reported in the text channel, not raised.

        Args:
            user: User who spoke
            text: Transcribed text
        """
        try:
            self.miku_speaking = True

            if self.audio_source is None:
                logger.warning("No audio source connected; voice response will be text-only")

            # Show processing
            await self.text_channel.send("💭 *Miku is thinking...*")

            # Import here to avoid circular imports
            from utils.llm import get_current_gpu_url
            import aiohttp

            # Simple system prompt for voice
            system_prompt = """You are Hatsune Miku, the virtual singer.
Respond naturally and concisely as Miku would in a voice conversation.
Keep responses short (1-3 sentences) since they will be spoken aloud."""

            payload = {
                "model": globals.TEXT_MODEL,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text}
                ],
                "stream": True,
                "temperature": 0.8,
                "max_tokens": 200
            }

            headers = {'Content-Type': 'application/json'}
            llama_url = get_current_gpu_url()

            # Stream LLM response to TTS
            response_parts = []
            async with aiohttp.ClientSession() as http_session:
                async with http_session.post(
                    f"{llama_url}/v1/chat/completions",
                    json=payload,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=60)
                ) as response:
                    if response.status != 200:
                        error_text = await response.text()
                        raise Exception(f"LLM error {response.status}: {error_text}")

                    # Stream tokens to TTS
                    async for raw_line in response.content:
                        if not self.miku_speaking:
                            # Interrupted
                            break

                        done, content = self._parse_stream_line(raw_line)
                        if done:
                            break
                        if not content:
                            continue

                        # Re-read each time: the source is cleared when audio
                        # streaming is stopped while a response is in flight.
                        source = self.audio_source
                        if source is not None:
                            await source.send_token(content)
                        response_parts.append(content)

            # Flush TTS
            if self.miku_speaking:
                source = self.audio_source
                if source is not None:
                    await source.flush()

                # Show response
                full_response = "".join(response_parts).strip()
                await self.text_channel.send(f"🎤 Miku: *\"{full_response}\"*")
                logger.info(f"✓ Voice response complete: {full_response}")

        except Exception as e:
            logger.error(f"Voice response failed: {e}", exc_info=True)
            try:
                await self.text_channel.send("❌ Sorry, I had trouble responding")
            except Exception as send_error:
                # Reporting the failure is best effort; never raise from here.
                logger.error(f"Failed to report voice response error: {send_error}")

        finally:
            self.miku_speaking = False

    async def _cancel_tts(self):
        """
        Cancel current TTS synthesis.

        Clears the speaking flag, stops Discord playback, then asks the TTS
        service to interrupt. Errors are logged, not raised.
        """
        logger.info("Canceling TTS synthesis")

        # Clear the flag first so the token-streaming loop in
        # _generate_voice_response stops feeding TTS while we await the network.
        self.miku_speaking = False

        # Stop Discord playback
        # NOTE(review): in this file playback is only started in
        # start_audio_streaming(); confirm it is restarted after stop().
        if self.voice_client and self.voice_client.is_playing():
            self.voice_client.stop()

        # Send interrupt to RVC
        try:
            import aiohttp
            timeout = aiohttp.ClientTimeout(total=self.TTS_INTERRUPT_TIMEOUT_SECONDS)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(self.TTS_INTERRUPT_URL) as resp:
                    if resp.status == 200:
                        logger.info("✓ TTS interrupted")
                    else:
                        logger.warning(f"TTS interrupt returned HTTP {resp.status}")
        except Exception as e:
            logger.error(f"Failed to interrupt TTS: {e}")


# Global singleton instance, created at import time. VoiceSessionManager.__new__
# always returns the same object, so this is the one shared manager.
voice_manager = VoiceSessionManager()