From 911f11ee9fa843fade9cbd9547435e544426cc31 Mon Sep 17 00:00:00 2001 From: koko210Serve Date: Fri, 16 Jan 2026 13:01:08 +0200 Subject: [PATCH] Untested Phase 1 (Foundation & Resource management) of voice chat integration --- bot/bot.py | 28 + bot/commands/voice.py | 229 +++ bot/globals.py | 9 + bot/requirements.txt | 2 + bot/utils/autonomous.py | 22 + bot/utils/bipolar_mode.py | 30 + bot/utils/profile_picture_manager.py | 21 + bot/utils/voice_manager.py | 358 +++++ readmes/VOICE_CHAT_IMPLEMENTATION_PLAN.md | 1589 +++++++++++++++++++++ 9 files changed, 2288 insertions(+) create mode 100644 bot/commands/voice.py create mode 100644 bot/utils/voice_manager.py create mode 100644 readmes/VOICE_CHAT_IMPLEMENTATION_PLAN.md diff --git a/bot/bot.py b/bot/bot.py index 8d604ef..daee3cc 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -125,6 +125,19 @@ async def on_message(message): if message.author == globals.client.user: return + # Check for voice commands first (!miku join, !miku leave, !miku voice-status) + if not isinstance(message.channel, discord.DMChannel) and message.content.strip().lower().startswith('!miku '): + from commands.voice import handle_voice_command + + parts = message.content.strip().split() + if len(parts) >= 2: + cmd = parts[1].lower() + args = parts[2:] if len(parts) > 2 else [] + + if cmd in ['join', 'leave', 'voice-status']: + await handle_voice_command(message, cmd, args) + return + # Skip processing if a bipolar argument is in progress in this channel if not isinstance(message.channel, discord.DMChannel): from utils.bipolar_mode import is_argument_in_progress @@ -196,6 +209,14 @@ async def on_message(message): logger.error(f"Failed to fetch replied message for context: {e}") async with message.channel.typing(): + # Check if vision model is blocked (voice session active) + if message.attachments and globals.VISION_MODEL_BLOCKED: + await message.channel.send( + "🎀 I can't look at images or videos right now, I'm talking in voice chat! 
" + "Send it again after I leave the voice channel." + ) + return + # If message has an image, video, or GIF attachment if message.attachments: for attachment in message.attachments: @@ -504,6 +525,13 @@ async def on_message(message): if is_image_request and image_prompt: logger.info(f"🎨 Image generation request detected: '{image_prompt}' from {message.author.display_name}") + # Block image generation during voice sessions + if globals.IMAGE_GENERATION_BLOCKED: + await message.channel.send(globals.IMAGE_GENERATION_BLOCK_MESSAGE) + await message.add_reaction('🎀') + logger.info("🚫 Image generation blocked - voice session active") + return + # Handle the image generation workflow success = await handle_image_generation_request(message, image_prompt) if success: diff --git a/bot/commands/voice.py b/bot/commands/voice.py new file mode 100644 index 0000000..2267e88 --- /dev/null +++ b/bot/commands/voice.py @@ -0,0 +1,229 @@ +# voice.py +""" +Voice channel commands for Miku Discord bot. +Handles joining, leaving, and status commands for voice chat sessions. +""" + +import discord +from utils.voice_manager import voice_manager +from utils.logger import get_logger + +logger = get_logger('voice_commands') + + +async def handle_voice_command(message, cmd, args): + """ + Handle voice-related commands. + + Args: + message: Discord message object + cmd: Command name (join, leave, voice-status) + args: Command arguments + """ + + if cmd == 'join': + await _handle_join(message, args) + + elif cmd == 'leave': + await _handle_leave(message) + + elif cmd == 'voice-status': + await _handle_status(message) + + else: + await message.channel.send(f"❌ Unknown voice command: `{cmd}`") + + +async def _handle_join(message, args): + """ + Handle !miku join command. + Join voice channel and start session with resource locks. 
+ """ + # Get voice channel + voice_channel = None + + if args and args[0].startswith('<#'): + # Channel mentioned (e.g., !miku join #voice-chat) + try: + channel_id = int(args[0][2:-1]) + voice_channel = message.guild.get_channel(channel_id) + + if not isinstance(voice_channel, discord.VoiceChannel): + await message.channel.send("❌ That's not a voice channel!") + return + except (ValueError, AttributeError): + await message.channel.send("❌ Invalid channel!") + return + + else: + # Use user's current voice channel + if message.author.voice and message.author.voice.channel: + voice_channel = message.author.voice.channel + else: + await message.channel.send( + "❌ You must be in a voice channel! " + "Or mention a voice channel like `!miku join #voice-chat`" + ) + return + + # Check permissions + if not voice_channel.permissions_for(message.guild.me).connect: + await message.channel.send(f"❌ I don't have permission to join {voice_channel.mention}!") + return + + if not voice_channel.permissions_for(message.guild.me).speak: + await message.channel.send(f"❌ I don't have permission to speak in {voice_channel.mention}!") + return + + # Start session + try: + await message.channel.send(f"🎀 Joining {voice_channel.mention}...") + + await voice_manager.start_session( + message.guild.id, + voice_channel, + message.channel # Use current text channel for prompts + ) + + embed = discord.Embed( + title="🎀 Voice Chat Active", + description=f"I've joined {voice_channel.mention}!", + color=discord.Color.from_rgb(134, 206, 203) # Miku teal + ) + embed.add_field( + name="How to use", + value=f"Send messages in {message.channel.mention} to make me speak!", + inline=False + ) + embed.add_field( + name="⚠️ Resource Mode", + value=( + "β€’ Text inference on AMD GPU only\n" + "β€’ Vision model disabled\n" + "β€’ Image generation disabled\n" + "β€’ Other text channels paused" + ), + inline=False + ) + embed.set_footer(text="Use !miku leave to end the session") + + await 
message.channel.send(embed=embed) + + logger.info(f"Voice session started by {message.author} in {voice_channel.name}") + + except Exception as e: + await message.channel.send(f"❌ Failed to join voice: {str(e)}") + logger.error(f"Failed to start voice session: {e}", exc_info=True) + + +async def _handle_leave(message): + """ + Handle !miku leave command. + Leave voice channel and release all resources. + """ + if not voice_manager.active_session: + await message.channel.send("❌ I'm not in a voice channel!") + return + + # Check if user is in the same guild as the active session + if voice_manager.active_session.guild_id != message.guild.id: + await message.channel.send("❌ I'm in a voice channel in a different server!") + return + + try: + voice_channel_name = voice_manager.active_session.voice_channel.name + + await message.channel.send("πŸ‘‹ Leaving voice channel...") + + await voice_manager.end_session() + + embed = discord.Embed( + title="πŸ‘‹ Voice Chat Ended", + description=f"Left {voice_channel_name}", + color=discord.Color.from_rgb(134, 206, 203) + ) + embed.add_field( + name="βœ… Resources Released", + value=( + "β€’ Vision model available\n" + "β€’ Image generation available\n" + "β€’ Text channels resumed\n" + "β€’ All features restored" + ), + inline=False + ) + + await message.channel.send(embed=embed) + + logger.info(f"Voice session ended by {message.author}") + + except Exception as e: + await message.channel.send(f"⚠️ Error leaving voice: {str(e)}") + logger.error(f"Failed to end voice session: {e}", exc_info=True) + + +async def _handle_status(message): + """ + Handle !miku voice-status command. + Show current voice session status. 
+ """ + if not voice_manager.active_session: + embed = discord.Embed( + title="πŸ”‡ No Active Voice Session", + description="I'm not currently in a voice channel.", + color=discord.Color.greyple() + ) + embed.add_field( + name="To start", + value="Use `!miku join` while in a voice channel", + inline=False + ) + await message.channel.send(embed=embed) + return + + session = voice_manager.active_session + + # Check if in same guild + if session.guild_id != message.guild.id: + await message.channel.send("ℹ️ I'm in a voice channel in a different server.") + return + + embed = discord.Embed( + title="🎀 Voice Session Active", + description=f"Currently in voice chat", + color=discord.Color.from_rgb(134, 206, 203) + ) + + embed.add_field( + name="Voice Channel", + value=session.voice_channel.mention, + inline=True + ) + + embed.add_field( + name="Prompt Channel", + value=session.text_channel.mention, + inline=True + ) + + embed.add_field( + name="πŸ“Š Resource Allocation", + value=( + "**GPU Usage:**\n" + "β€’ AMD RX 6800: Text model + RVC\n" + "β€’ GTX 1660: Soprano TTS only\n\n" + "**Blocked Features:**\n" + "β€’ ❌ Vision model\n" + "β€’ ❌ Image generation\n" + "β€’ ❌ Bipolar mode\n" + "β€’ ❌ Profile picture changes\n" + "β€’ ⏸️ Autonomous engine\n" + "β€’ ⏸️ Scheduled events\n" + "β€’ πŸ“¦ Other text channels (queued)" + ), + inline=False + ) + + embed.set_footer(text="Use !miku leave to end the session") + + await message.channel.send(embed=embed) diff --git a/bot/globals.py b/bot/globals.py index 578e0a2..064ba50 100644 --- a/bot/globals.py +++ b/bot/globals.py @@ -96,3 +96,12 @@ LAST_FULL_PROMPT = "" # Persona Dialogue System (conversations between Miku and Evil Miku) LAST_PERSONA_DIALOGUE_TIME = 0 # Timestamp of last dialogue for cooldown +# Voice Chat Session State +VOICE_SESSION_ACTIVE = False +TEXT_MESSAGE_QUEUE = [] # Queue for messages received during voice session + +# Feature Blocking Flags (set during voice session) +VISION_MODEL_BLOCKED = False 
+IMAGE_GENERATION_BLOCKED = False +IMAGE_GENERATION_BLOCK_MESSAGE = None + diff --git a/bot/requirements.txt b/bot/requirements.txt index 4ae5c22..bd5808e 100644 --- a/bot/requirements.txt +++ b/bot/requirements.txt @@ -20,3 +20,5 @@ numpy scikit-learn transformers torch +PyNaCl>=1.5.0 +websockets>=12.0 diff --git a/bot/utils/autonomous.py b/bot/utils/autonomous.py index ff7f403..499d606 100644 --- a/bot/utils/autonomous.py +++ b/bot/utils/autonomous.py @@ -17,12 +17,34 @@ logger = get_logger('autonomous') _last_action_execution = {} # guild_id -> timestamp _MIN_ACTION_INTERVAL = 30 # Minimum 30 seconds between autonomous actions +# Pause state for voice sessions +_autonomous_paused = False + + +def pause_autonomous_system(): + """Pause autonomous message generation (called during voice sessions)""" + global _autonomous_paused + _autonomous_paused = True + logger.info("Autonomous system paused") + + +def resume_autonomous_system(): + """Resume autonomous message generation (called after voice sessions)""" + global _autonomous_paused + _autonomous_paused = False + logger.info("Autonomous system resumed") + async def autonomous_tick_v2(guild_id: int): """ New autonomous tick that uses context-aware decision making. Replaces the random 10% chance with intelligent decision. 
""" + # Check if autonomous is paused (voice session) + if _autonomous_paused: + logger.debug(f"[V2] Autonomous system paused (voice session active)") + return + # Rate limiting check now = time.time() if guild_id in _last_action_execution: diff --git a/bot/utils/bipolar_mode.py b/bot/utils/bipolar_mode.py index bc1108b..5bfe5a6 100644 --- a/bot/utils/bipolar_mode.py +++ b/bot/utils/bipolar_mode.py @@ -28,6 +28,31 @@ MIN_EXCHANGES = 4 # Minimum number of back-and-forth exchanges before ending ca ARGUMENT_TRIGGER_CHANCE = 0.15 # 15% chance for the other Miku to break through DELAY_BETWEEN_MESSAGES = (2.0, 5.0) # Random delay between argument messages (seconds) +# Pause state for voice sessions +_bipolar_interactions_paused = False + +# ============================================================================ +# VOICE SESSION PAUSE/RESUME +# ============================================================================ + +def pause_bipolar_interactions(): + """Pause all bipolar interactions (called during voice sessions)""" + global _bipolar_interactions_paused + _bipolar_interactions_paused = True + logger.info("Bipolar interactions paused") + + +def resume_bipolar_interactions(): + """Resume bipolar interactions (called after voice sessions)""" + global _bipolar_interactions_paused + _bipolar_interactions_paused = False + logger.info("Bipolar interactions resumed") + + +def is_bipolar_paused(): + """Check if bipolar interactions are currently paused""" + return _bipolar_interactions_paused + # ============================================================================ # STATE PERSISTENCE # ============================================================================ @@ -1039,6 +1064,11 @@ async def maybe_trigger_argument(channel: discord.TextChannel, client, context: if not globals.BIPOLAR_MODE: return False + # Check if bipolar interactions are paused (voice session) + if is_bipolar_paused(): + logger.debug("Bipolar argument blocked (voice session active)") + 
return False + if is_argument_in_progress(channel.id): return False diff --git a/bot/utils/profile_picture_manager.py b/bot/utils/profile_picture_manager.py index 7b00ca4..d78f59e 100644 --- a/bot/utils/profile_picture_manager.py +++ b/bot/utils/profile_picture_manager.py @@ -47,6 +47,17 @@ class ProfilePictureManager: def __init__(self): self._ensure_directories() + self.switching_locked = False # Lock for voice session + + def lock_switching(self): + """Lock profile picture changes during voice session""" + self.switching_locked = True + logger.info("Profile picture switching locked") + + def unlock_switching(self): + """Unlock profile picture changes after voice session""" + self.switching_locked = False + logger.info("Profile picture switching unlocked") def _ensure_directories(self): """Ensure profile picture directory exists""" @@ -247,6 +258,16 @@ class ProfilePictureManager: Returns: Dict with status and metadata """ + # Check if switching is locked (voice session active) + if self.switching_locked: + logger.info("Profile picture change blocked (voice session active)") + return { + "success": False, + "source": None, + "error": "Profile picture switching locked during voice session", + "metadata": {} + } + result = { "success": False, "source": None, diff --git a/bot/utils/voice_manager.py b/bot/utils/voice_manager.py new file mode 100644 index 0000000..3683ea0 --- /dev/null +++ b/bot/utils/voice_manager.py @@ -0,0 +1,358 @@ +# voice_manager.py +""" +Voice session manager for Miku Discord bot. +Handles Discord voice channel connections, resource locking, and feature blocking during voice sessions. 
+ +During a voice session: +- GPU switches to AMD for text inference only +- Vision model is blocked (keeps GTX 1660 for TTS) +- Image generation is blocked +- Bipolar mode interactions are disabled +- Profile picture switching is locked +- Autonomous engine is paused +- Scheduled events are paused +- Text channels are paused (messages queued) +""" + +import asyncio +import json +import os +from typing import Optional +import discord +import globals +from utils.logger import get_logger + +logger = get_logger('voice_manager') + + +class VoiceSessionManager: + """ + Singleton manager for voice chat sessions. + Ensures only one voice session active at a time and manages all resource locks. + """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance._initialized = False + return cls._instance + + def __init__(self): + if self._initialized: + return + + self.active_session: Optional['VoiceSession'] = None + self.session_lock = asyncio.Lock() + self._initialized = True + logger.info("VoiceSessionManager initialized") + + async def start_session(self, guild_id: int, voice_channel: discord.VoiceChannel, text_channel: discord.TextChannel): + """ + Start a voice session with full resource locking. + + Args: + guild_id: Discord guild ID + voice_channel: Voice channel to join + text_channel: Text channel for voice prompts + + Raises: + Exception: If session already active or resources can't be locked + """ + async with self.session_lock: + if self.active_session: + raise Exception("Voice session already active") + + logger.info(f"Starting voice session in {voice_channel.name} (guild {guild_id})") + + try: + # 1. Switch to AMD GPU for text inference + await self._switch_to_amd_gpu() + + # 2. Block vision model loading + await self._block_vision_model() + + # 3. Disable image generation (ComfyUI) + await self._disable_image_generation() + + # 4. 
Pause text channel inference (queue messages) + await self._pause_text_channels() + + # 5. Disable bipolar mode interactions (Miku/Evil Miku arguments) + await self._disable_bipolar_mode() + + # 6. Disable profile picture switching + await self._disable_profile_picture_switching() + + # 7. Pause autonomous engine + await self._pause_autonomous_engine() + + # 8. Pause scheduled events + await self._pause_scheduled_events() + + # 9. Pause figurine notifier + await self._pause_figurine_notifier() + + # 10. Create and connect voice session + self.active_session = VoiceSession(guild_id, voice_channel, text_channel) + # Note: Actual voice connection will be implemented in Phase 2 + + logger.info(f"βœ“ Voice session started successfully") + + except Exception as e: + logger.error(f"Failed to start voice session: {e}", exc_info=True) + # Cleanup on failure + await self._cleanup_failed_start() + raise + + async def end_session(self): + """ + End voice session and release all resources. + """ + async with self.session_lock: + if not self.active_session: + logger.warning("No active voice session to end") + return + + logger.info("Ending voice session...") + + try: + # 1. Disconnect from voice (Phase 2 implementation) + # await self.active_session.disconnect() + + # 2. Resume text channel inference + await self._resume_text_channels() + + # 3. Unblock vision model + await self._unblock_vision_model() + + # 4. Re-enable image generation + await self._enable_image_generation() + + # 5. Re-enable bipolar mode interactions + await self._enable_bipolar_mode() + + # 6. Re-enable profile picture switching + await self._enable_profile_picture_switching() + + # 7. Resume autonomous engine + await self._resume_autonomous_engine() + + # 8. Resume scheduled events + await self._resume_scheduled_events() + + # 9. Resume figurine notifier + await self._resume_figurine_notifier() + + # 10. 
Clear active session + self.active_session = None + + logger.info("βœ“ Voice session ended successfully, all resources released") + + except Exception as e: + logger.error(f"Error during session cleanup: {e}", exc_info=True) + # Force clear session even on error + self.active_session = None + raise + + # ==================== Resource Locking Methods ==================== + + async def _switch_to_amd_gpu(self): + """Switch text inference to AMD GPU (RX 6800)""" + try: + gpu_state_file = os.path.join("memory", "gpu_state.json") + os.makedirs("memory", exist_ok=True) + + with open(gpu_state_file, "w") as f: + json.dump({"current_gpu": "amd", "reason": "voice_session"}, f) + + logger.info("βœ“ Switched to AMD GPU for text inference") + except Exception as e: + logger.error(f"Failed to switch GPU: {e}") + raise + + async def _block_vision_model(self): + """Prevent vision model from loading during voice session""" + globals.VISION_MODEL_BLOCKED = True + logger.info("βœ“ Vision model blocked") + + async def _unblock_vision_model(self): + """Allow vision model to load after voice session""" + globals.VISION_MODEL_BLOCKED = False + logger.info("βœ“ Vision model unblocked") + + async def _disable_image_generation(self): + """Block ComfyUI image generation during voice session""" + globals.IMAGE_GENERATION_BLOCKED = True + globals.IMAGE_GENERATION_BLOCK_MESSAGE = ( + "🎀 I can't draw right now, I'm talking in voice chat! " + "Ask me again after I leave the voice channel." 
+ ) + logger.info("βœ“ Image generation disabled") + + async def _enable_image_generation(self): + """Re-enable image generation after voice session""" + globals.IMAGE_GENERATION_BLOCKED = False + globals.IMAGE_GENERATION_BLOCK_MESSAGE = None + logger.info("βœ“ Image generation re-enabled") + + async def _pause_text_channels(self): + """Queue text messages instead of processing during voice session""" + globals.VOICE_SESSION_ACTIVE = True + globals.TEXT_MESSAGE_QUEUE = [] + logger.info("βœ“ Text channels paused (messages will be queued)") + + async def _resume_text_channels(self): + """Process queued messages after voice session""" + globals.VOICE_SESSION_ACTIVE = False + queued_count = len(globals.TEXT_MESSAGE_QUEUE) + + if queued_count > 0: + logger.info(f"Resuming text channels, {queued_count} messages queued") + # TODO: Process queue in Phase 2 (need message handler integration) + # For now, just clear the queue + globals.TEXT_MESSAGE_QUEUE = [] + logger.warning(f"Discarded {queued_count} queued messages (queue processing not yet implemented)") + else: + logger.info("βœ“ Text channels resumed (no queued messages)") + + async def _disable_bipolar_mode(self): + """Prevent Miku/Evil Miku arguments during voice session""" + try: + from utils.bipolar_mode import pause_bipolar_interactions + pause_bipolar_interactions() + logger.info("βœ“ Bipolar mode interactions disabled") + except ImportError: + logger.warning("bipolar_mode module not found, skipping") + except AttributeError: + logger.warning("pause_bipolar_interactions not implemented yet, skipping") + + async def _enable_bipolar_mode(self): + """Re-enable Miku/Evil Miku arguments after voice session""" + try: + from utils.bipolar_mode import resume_bipolar_interactions + resume_bipolar_interactions() + logger.info("βœ“ Bipolar mode interactions re-enabled") + except ImportError: + logger.warning("bipolar_mode module not found, skipping") + except AttributeError: + logger.warning("resume_bipolar_interactions not 
implemented yet, skipping") + + async def _disable_profile_picture_switching(self): + """Lock profile picture during voice session""" + try: + from utils.profile_picture_manager import profile_picture_manager + if hasattr(profile_picture_manager, 'lock_switching'): + profile_picture_manager.lock_switching() + logger.info("βœ“ Profile picture switching disabled") + else: + logger.warning("profile_picture_manager.lock_switching not implemented yet, skipping") + except ImportError: + logger.warning("profile_picture_manager module not found, skipping") + + async def _enable_profile_picture_switching(self): + """Unlock profile picture after voice session""" + try: + from utils.profile_picture_manager import profile_picture_manager + if hasattr(profile_picture_manager, 'unlock_switching'): + profile_picture_manager.unlock_switching() + logger.info("βœ“ Profile picture switching re-enabled") + else: + logger.warning("profile_picture_manager.unlock_switching not implemented yet, skipping") + except ImportError: + logger.warning("profile_picture_manager module not found, skipping") + + async def _pause_autonomous_engine(self): + """Pause autonomous message generation during voice session""" + try: + from utils.autonomous import pause_autonomous_system + pause_autonomous_system() + logger.info("βœ“ Autonomous engine paused") + except ImportError: + logger.warning("autonomous module not found, skipping") + except AttributeError: + logger.warning("pause_autonomous_system not implemented yet, skipping") + + async def _resume_autonomous_engine(self): + """Resume autonomous message generation after voice session""" + try: + from utils.autonomous import resume_autonomous_system + resume_autonomous_system() + logger.info("βœ“ Autonomous engine resumed") + except ImportError: + logger.warning("autonomous module not found, skipping") + except AttributeError: + logger.warning("resume_autonomous_system not implemented yet, skipping") + + async def _pause_scheduled_events(self): + 
"""Pause all scheduled jobs during voice session""" + try: + globals.scheduler.pause() + logger.info("βœ“ Scheduled events paused") + except Exception as e: + logger.error(f"Failed to pause scheduler: {e}") + + async def _resume_scheduled_events(self): + """Resume scheduled jobs after voice session""" + try: + globals.scheduler.resume() + logger.info("βœ“ Scheduled events resumed") + except Exception as e: + logger.error(f"Failed to resume scheduler: {e}") + + async def _pause_figurine_notifier(self): + """Pause figurine notifications during voice session""" + try: + # Assuming figurine notifier is a scheduled job + globals.scheduler.pause_job('figurine_notifier') + logger.info("βœ“ Figurine notifier paused") + except Exception as e: + # Job might not exist, that's okay + logger.debug(f"Could not pause figurine notifier (may not exist): {e}") + + async def _resume_figurine_notifier(self): + """Resume figurine notifications after voice session""" + try: + globals.scheduler.resume_job('figurine_notifier') + logger.info("βœ“ Figurine notifier resumed") + except Exception as e: + # Job might not exist, that's okay + logger.debug(f"Could not resume figurine notifier (may not exist): {e}") + + async def _cleanup_failed_start(self): + """Cleanup resources if session start fails""" + logger.warning("Cleaning up after failed session start...") + try: + await self._unblock_vision_model() + await self._enable_image_generation() + await self._resume_text_channels() + await self._enable_bipolar_mode() + await self._enable_profile_picture_switching() + await self._resume_autonomous_engine() + await self._resume_scheduled_events() + await self._resume_figurine_notifier() + except Exception as e: + logger.error(f"Error during cleanup: {e}") + + +class VoiceSession: + """ + Represents an active voice chat session. + Phase 1: Basic structure only, voice connection in Phase 2. 
+ """ + + def __init__(self, guild_id: int, voice_channel: discord.VoiceChannel, text_channel: discord.TextChannel): + self.guild_id = guild_id + self.voice_channel = voice_channel + self.text_channel = text_channel + self.voice_client: Optional[discord.VoiceClient] = None + self.active = False + + logger.info(f"VoiceSession created for {voice_channel.name} in guild {guild_id}") + + # Phase 2: Implement voice connection, audio streaming, TTS integration + + +# Global singleton instance +voice_manager = VoiceSessionManager() diff --git a/readmes/VOICE_CHAT_IMPLEMENTATION_PLAN.md b/readmes/VOICE_CHAT_IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..77e94cd --- /dev/null +++ b/readmes/VOICE_CHAT_IMPLEMENTATION_PLAN.md @@ -0,0 +1,1589 @@ +# Miku Voice Channel Chat Feature - Implementation Plan + +## Executive Summary + +This document outlines a comprehensive plan to implement real-time voice channel functionality for Miku, enabling her to: +- Join Discord voice channels and speak using her TTS pipeline +- Stream text tokens from llama-swap's LLM directly to TTS for maximum real-time responsiveness +- Accept text-based prompts from a designated text channel as a temporary input method +- Manage system resources strictly to maintain performance on constrained hardware + +### ⚠️ Critical Resource Management Requirements + +Due to tight hardware constraints, **multiple bot features must be disabled** during voice sessions: + +| Feature | Action During Voice | Reason | +|---------|---------------------|--------| +| Vision Model | ❌ Blocked | Frees GTX 1660 for TTS only | +| Image Generation | ❌ Blocked | Prevents ComfyUI GPU usage | +| Bipolar Mode | ❌ Disabled | Prevents dual-personality interactions | +| Profile Pictures | πŸ”’ Locked | Avoids Discord API overhead | +| Autonomous Engine | ⏸️ Paused | Reduces inference load | +| Scheduled Events | ⏸️ Paused | Prevents background tasks | +| Figurine Notifier | ⏸️ Paused | Prevents background tasks | +| Text 
Channels | πŸ“¦ Queued | Messages processed after session | + +**GPU Allocation During Voice:** +- **GTX 1660**: Soprano TTS only (no LLM, no vision) +- **AMD RX 6800**: RVC API + llama-swap-amd text model (~10-12GB) + +--- + +## 1. System Architecture Overview + +### 1.1 Current TTS Pipeline (soprano_to_rvc) +**Components:** +- **Soprano TTS Server** (GTX 1660 + CUDA) + - Runs in `miku-soprano-tts` container + - Listens on ZMQ port 5555 (internal network) + - Converts text β†’ 32kHz audio + +- **RVC API Server** (AMD RX 6800 + ROCm) + - Runs in `miku-rvc-api` container + - Exposes WebSocket endpoint: `ws://localhost:8765/ws/stream` + - Exposes HTTP endpoint: `http://localhost:8765` + - Converts Soprano output β†’ Miku voice (48kHz) + +**WebSocket Protocol** (`/ws/stream`): +``` +Client β†’ Server: {"token": "Hello", "pitch_shift": 0} +Client β†’ Server: {"token": " world"} +Client β†’ Server: {"token": "!", "flush": false} +Server β†’ Client: [binary PCM float32 audio @ 48kHz] +``` + +### 1.2 Current LLM Infrastructure +**Text Models:** +- **llama-swap** (GTX 1660) - Port 8090 + - Models: llama3.1, darkidol, vision + - Supports streaming via `/completion` endpoint with `stream=true` + +- **llama-swap-amd** (AMD RX 6800) - Port 8091 + - Models: llama3.1, darkidol (no vision) + - Supports streaming via `/completion` endpoint with `stream=true` + +### 1.3 Current Discord Bot Architecture +**Main Components:** +- `bot/bot.py` - Main Discord client event handler +- `bot/globals.py` - Global state and configuration +- `bot/utils/llm.py` - LLM query interface +- `bot/server_manager.py` - Multi-server configuration management + +**Key Features:** +- Multi-server support with per-server mood/config +- DM support with separate mood system +- Evil mode (alternate personality with uncensored model) +- Bipolar mode (both personalities can interact) +- Vision model integration for images/videos + +--- + +## 2. Voice Channel Feature Requirements + +### 2.1 Core Functionality +1. 
**Voice Channel Connection** + - Miku can join a voice channel via command (e.g., `!miku join`) + - Miku can leave via command (e.g., `!miku leave`) + - Only one voice session active at a time (resource constraint) + +2. **Real-Time Text-to-Speech** + - Stream LLM tokens directly to TTS WebSocket + - Send audio chunks to Discord voice as they're generated + - Minimize latency between token generation and audio playback + +3. **Text-Based Input (Temporary)** + - Designated text channel for prompting Miku (e.g., `#miku-voice-prompt`) + - Messages in this channel trigger voice responses + - Only active when Miku is in voice channel + +4. **Resource Management** + - **GPU Switching**: Use llama-swap-amd (RX 6800) exclusively for text generation + - **Vision Model Blocking**: Prevent vision model from loading during voice session + - **Text Channel Pausing**: Pause/queue regular text channel inference + - **Cleanup**: Properly release resources when voice session ends + +### 2.2 User Experience Goals +- **Low Latency**: First audio chunk should play within 1-2 seconds of prompt +- **Natural Speech**: Sentence boundaries should be respected for natural pauses +- **Reliable**: Graceful error handling and recovery +- **Non-Intrusive**: Shouldn't break existing bot functionality + +--- + +## 3. Resource Management Strategy + +### 3.1 Hardware Constraints +**Available Resources:** +- GTX 1660 (6GB VRAM): Currently runs llama-swap + Soprano TTS +- AMD RX 6800 (16GB VRAM): Currently runs llama-swap-amd + RVC API + +**During Voice Session:** +- **GTX 1660**: Dedicated to Soprano TTS only (no LLM) +- **AMD RX 6800**: Split between RVC API + llama-swap-amd text model + - RVC uses ~4-6GB VRAM + - Text model uses ~5-6GB VRAM + - Total: ~10-12GB (within 16GB limit) + +**Features That Must Be Disabled During Voice Session:** + +Due to resource constraints and to ensure voice chat performance, the following features must be completely disabled while Miku is in a voice channel: + +1. 
**Vision Model Loading** - Prevents GTX 1660 from loading vision models (keeps TTS running) +2. **Image Generation (ComfyUI)** - Blocks draw commands with custom message +3. **Bipolar Mode Interactions** - Prevents Miku/Evil Miku arguments and dialogues +4. **Profile Picture Switching** - Locks avatar changes during session +5. **Autonomous Engine** - Pauses autonomous message generation +6. **Scheduled Events** - Pauses all scheduled jobs (e.g., Monday videos) +7. **Figurine Notifier** - Pauses figurine availability notifications +8. **Text Channel Inference** - Queues regular text messages for later processing + +**User-Facing Messages for Blocked Features:** +- Image generation: "🎀 I can't draw right now, I'm talking in voice chat! Ask me again after I leave the voice channel." +- Vision requests: "🎀 I can't look at images or videos right now, I'm talking in voice chat! Send it again after I leave." +- Bipolar triggers: *(Silent - no argument starts)* +- Profile changes: *(Silent - no avatar updates)* + +### 3.2 Resource Locking Mechanism + +**Implementation via `VoiceSessionManager` singleton:** + +```python +class VoiceSessionManager: + def __init__(self): + self.active_session = None # VoiceSession instance or None + self.session_lock = asyncio.Lock() + + async def start_session(self, guild_id, voice_channel, text_channel): + """Start voice session with resource locks""" + async with self.session_lock: + if self.active_session: + raise Exception("Voice session already active") + + # 1. Switch to AMD GPU for text inference + await self._switch_to_amd_gpu() + + # 2. Block vision model loading + await self._block_vision_model() + + # 3. Pause text channel inference (queue messages) + await self._pause_text_channels() + + # 4. Disable bipolar mode interactions (Miku/Evil Miku arguments) + await self._disable_bipolar_mode() + + # 5. Disable profile picture switching + await self._disable_profile_picture_switching() + + # 6. 
Disable image generation (ComfyUI) + await self._disable_image_generation() + + # 7. Pause autonomous engine + await self._pause_autonomous_engine() + + # 8. Pause scheduled events + await self._pause_scheduled_events() + + # 9. Pause figurine notifier + await self._pause_figurine_notifier() + + # 10. Create voice session + self.active_session = VoiceSession(guild_id, voice_channel, text_channel) + await self.active_session.connect() + + async def end_session(self): + """End voice session and release resources""" + async with self.session_lock: + if not self.active_session: + return + + # 1. Disconnect from voice + await self.active_session.disconnect() + + # 2. Resume text channel inference + await self._resume_text_channels() + + # 3. Unblock vision model + await self._unblock_vision_model() + + # 4. Re-enable bipolar mode interactions + await self._enable_bipolar_mode() + + # 5. Re-enable profile picture switching + await self._enable_profile_picture_switching() + + # 6. Re-enable image generation + await self._enable_image_generation() + + # 7. Resume autonomous engine + await self._resume_autonomous_engine() + + # 8. Resume scheduled events + await self._resume_scheduled_events() + + # 9. Resume figurine notifier + await self._resume_figurine_notifier() + + # 10. 
Restore original GPU (optional) + # Keep AMD for now to avoid extra switching + + self.active_session = None +``` + +### 3.3 Detailed Resource Lock Methods + +**Each resource lock/unlock requires specific implementation:** + +#### 3.3.1 Text Channel Pausing +```python +async def _pause_text_channels(self): + """Queue text messages instead of processing during voice session""" + globals.VOICE_SESSION_ACTIVE = True + globals.TEXT_MESSAGE_QUEUE = [] + logger.info("Text channels paused (messages will be queued)") + +async def _resume_text_channels(self): + """Process queued messages after voice session""" + globals.VOICE_SESSION_ACTIVE = False + queued_count = len(globals.TEXT_MESSAGE_QUEUE) + logger.info(f"Resuming text channels, processing {queued_count} queued messages") + # Process queue in background task + asyncio.create_task(self._process_message_queue()) +``` + +#### 3.3.2 Bipolar Mode Disabling +```python +async def _disable_bipolar_mode(self): + """Prevent Miku/Evil Miku arguments during voice session""" + from utils.bipolar_mode import pause_bipolar_interactions + pause_bipolar_interactions() + logger.info("Bipolar mode interactions disabled") + +async def _enable_bipolar_mode(self): + """Re-enable Miku/Evil Miku arguments after voice session""" + from utils.bipolar_mode import resume_bipolar_interactions + resume_bipolar_interactions() + logger.info("Bipolar mode interactions re-enabled") +``` + +#### 3.3.3 Profile Picture Switching +```python +async def _disable_profile_picture_switching(self): + """Lock profile picture during voice session""" + from utils.profile_picture_manager import profile_picture_manager + profile_picture_manager.lock_switching() + logger.info("Profile picture switching disabled") + +async def _enable_profile_picture_switching(self): + """Unlock profile picture after voice session""" + from utils.profile_picture_manager import profile_picture_manager + profile_picture_manager.unlock_switching() + logger.info("Profile picture 
switching re-enabled") +``` + +#### 3.3.4 Image Generation Blocking +```python +async def _disable_image_generation(self): + """Block ComfyUI image generation during voice session""" + globals.IMAGE_GENERATION_BLOCKED = True + globals.IMAGE_GENERATION_BLOCK_MESSAGE = ( + "🎀 I can't draw right now, I'm talking in voice chat! " + "Ask me again after I leave the voice channel." + ) + logger.info("Image generation disabled") + +async def _enable_image_generation(self): + """Re-enable image generation after voice session""" + globals.IMAGE_GENERATION_BLOCKED = False + globals.IMAGE_GENERATION_BLOCK_MESSAGE = None + logger.info("Image generation re-enabled") +``` + +#### 3.3.5 Autonomous Engine Pausing +```python +async def _pause_autonomous_engine(self): + """Pause autonomous message generation during voice session""" + from utils.autonomous import pause_autonomous_system + pause_autonomous_system() + logger.info("Autonomous engine paused") + +async def _resume_autonomous_engine(self): + """Resume autonomous message generation after voice session""" + from utils.autonomous import resume_autonomous_system + resume_autonomous_system() + logger.info("Autonomous engine resumed") +``` + +#### 3.3.6 Scheduled Events Pausing +```python +async def _pause_scheduled_events(self): + """Pause all scheduled jobs during voice session""" + globals.scheduler.pause() + logger.info("Scheduled events paused") + +async def _resume_scheduled_events(self): + """Resume scheduled jobs after voice session""" + globals.scheduler.resume() + logger.info("Scheduled events resumed") +``` + +#### 3.3.7 Figurine Notifier Pausing +```python +async def _pause_figurine_notifier(self): + """Pause figurine notifications during voice session""" + # Assuming figurine notifier is a scheduled job + try: + globals.scheduler.pause_job('figurine_notifier') + logger.info("Figurine notifier paused") + except Exception as e: + logger.warning(f"Could not pause figurine notifier: {e}") + +async def 
_resume_figurine_notifier(self): + """Resume figurine notifications after voice session""" + try: + globals.scheduler.resume_job('figurine_notifier') + logger.info("Figurine notifier resumed") + except Exception as e: + logger.warning(f"Could not resume figurine notifier: {e}") +``` + +#### 3.3.8 Vision Model Blocking +```python +async def _block_vision_model(self): + """Prevent vision model from loading during voice session""" + globals.VISION_MODEL_BLOCKED = True + logger.info("Vision model blocked") + +async def _unblock_vision_model(self): + """Allow vision model to load after voice session""" + globals.VISION_MODEL_BLOCKED = False + logger.info("Vision model unblocked") +``` + +#### 3.3.9 GPU Switching +```python +async def _switch_to_amd_gpu(self): + """Switch text inference to AMD GPU""" + gpu_state_file = os.path.join("memory", "gpu_state.json") + with open(gpu_state_file, "w") as f: + json.dump({"current_gpu": "amd"}, f) + logger.info("Switched to AMD GPU for text inference") +``` + +### 3.4 Feature-Specific Response Handlers + +**Image Generation Request Handler:** +```python +# In bot message handler, before processing image generation +if globals.IMAGE_GENERATION_BLOCKED: + await message.channel.send(globals.IMAGE_GENERATION_BLOCK_MESSAGE) + await message.add_reaction('🎀') + return +``` + +**Vision Model Request Handler:** +```python +# In image/video handling code +if globals.VISION_MODEL_BLOCKED: + await message.channel.send( + "🎀 I can't look at images right now, I'm in voice chat! " + "Send it again after I leave." 
+ ) + return +``` + +**Bipolar Argument Trigger Handler:** +```python +# In bipolar_mode.py trigger detection +from utils.voice_manager import voice_manager + +if voice_manager.active_session: + logger.info("Bipolar argument blocked (voice session active)") + return # Skip argument trigger +``` + +### 3.5 Required Module Modifications + +**The following existing modules need to be updated to check voice session state:** + +#### 3.5.1 bot/utils/bipolar_mode.py +**Add checks before:** +- Argument triggers based on score thresholds +- Persona dialogue initiations +- Any webhook-based Miku/Evil Miku interactions + +```python +# At the top of functions that trigger bipolar interactions +from utils.voice_manager import voice_manager + +if voice_manager.active_session: + logger.debug("Bipolar interaction blocked (voice session active)") + return +``` + +#### 3.5.2 bot/utils/profile_picture_manager.py +**Add locking mechanism:** +```python +class ProfilePictureManager: + def __init__(self): + self.switching_locked = False + + def lock_switching(self): + """Lock profile picture changes during voice session""" + self.switching_locked = True + + def unlock_switching(self): + """Unlock profile picture changes after voice session""" + self.switching_locked = False + + async def update_profile_picture(self, ...): + if self.switching_locked: + logger.info("Profile picture change blocked (voice session active)") + return + # ... normal update logic ... +``` + +#### 3.5.3 bot/utils/autonomous.py +**Add pause/resume functions:** +```python +_autonomous_paused = False + +def pause_autonomous_system(): + """Pause autonomous message generation""" + global _autonomous_paused + _autonomous_paused = True + +def resume_autonomous_system(): + """Resume autonomous message generation""" + global _autonomous_paused + _autonomous_paused = False + +# In autonomous trigger functions: +def should_send_autonomous_message(): + if _autonomous_paused: + return False + # ... normal logic ... 
+``` + +#### 3.5.4 bot/bot.py (main message handler) +**Add checks for image generation and vision:** +```python +@globals.client.event +async def on_message(message): + # ... existing checks ... + + # Check if image generation is blocked + if "draw" in message.content.lower() and globals.IMAGE_GENERATION_BLOCKED: + await message.channel.send(globals.IMAGE_GENERATION_BLOCK_MESSAGE) + await message.add_reaction('🎀') + return + + # Check if vision model is blocked (images/videos) + if (message.attachments or message.embeds) and globals.VISION_MODEL_BLOCKED: + await message.channel.send( + "🎀 I can't look at images or videos right now, I'm talking in voice chat! " + "Send it again after I leave." + ) + return + + # ... rest of message handling ... +``` + +#### 3.5.5 bot/commands/actions.py (ComfyUI integration) +**Add check before image generation:** +```python +async def handle_image_generation(message, prompt): + if globals.IMAGE_GENERATION_BLOCKED: + await message.channel.send(globals.IMAGE_GENERATION_BLOCK_MESSAGE) + await message.add_reaction('🎀') + return + + # ... normal image generation logic ... +``` + +### 3.6 Text Channel Pausing Strategy + +**Options:** + +**Option A: Queue Messages (Recommended)** +- Store incoming messages in a queue during voice session +- Process queue after voice session ends +- Pros: No messages lost, users get responses eventually +- Cons: Responses delayed until voice session ends + +**Option B: Ignore Messages** +- Simply don't respond to text channels during voice session +- Send status message: "🎀 Miku is currently in voice chat..." +- Pros: Simple, clear behavior +- Cons: Users might think bot is broken + +**Recommendation: Option A with status indicator** +- Queue messages with timestamps +- Set bot status to "🎀 In Voice Chat" +- Process queue in order after session ends + +--- + +## 4. 
Technical Implementation Details + +### 4.1 Discord Voice Integration + +**Required Package:** +```bash +pip install PyNaCl # Required for voice support +``` + +**Voice Connection Flow:** +```python +import discord + +# Connect to voice channel +voice_channel = client.get_channel(voice_channel_id) +voice_client = await voice_channel.connect() + +# Create audio source from stream +audio_source = VoiceStreamSource(websocket_url) + +# Play audio +voice_client.play(audio_source) + +# Disconnect +await voice_client.disconnect() +``` + +**Key Classes:** +- `discord.VoiceClient` - Handles voice connection +- `discord.AudioSource` - Abstract base for audio streaming +- `discord.PCMAudio` - Raw PCM audio source (16-bit, 48kHz, stereo) + +### 4.2 Custom Audio Source for TTS Stream + +**Implementation:** +```python +class MikuVoiceSource(discord.AudioSource): + """ + Streams audio from RVC WebSocket to Discord voice channel. + """ + def __init__(self, websocket_url="ws://localhost:8765/ws/stream"): + self.websocket_url = websocket_url + self.ws = None + self.audio_queue = asyncio.Queue(maxsize=100) + self.running = False + self.frame_size = 3840 # 20ms @ 48kHz stereo (960 samples * 2 channels * 2 bytes) + + async def connect(self): + """Connect to TTS WebSocket""" + self.ws = await websockets.connect(self.websocket_url) + self.running = True + asyncio.create_task(self._receive_audio()) + + async def _receive_audio(self): + """Receive audio from WebSocket and queue for playback""" + while self.running: + try: + audio_bytes = await self.ws.recv() + # Convert float32 mono to int16 stereo + audio_data = self._process_audio(audio_bytes) + await self.audio_queue.put(audio_data) + except Exception as e: + logger.error(f"Audio receive error: {e}") + break + + def _process_audio(self, audio_bytes): + """ + Convert float32 mono @ 48kHz to int16 stereo @ 48kHz for Discord. 
+ Discord expects: 16-bit PCM, 48kHz, stereo + """ + # Decode float32 + audio = np.frombuffer(audio_bytes, dtype=np.float32) + + # Convert to int16 + audio_int16 = (audio * 32767).clip(-32768, 32767).astype(np.int16) + + # Convert mono to stereo (duplicate channel) + audio_stereo = np.repeat(audio_int16, 2) + + return audio_stereo.tobytes() + + def read(self): + """ + Called by Discord.py to get next audio frame (20ms). + Must be synchronous and return exactly 3840 bytes. + """ + try: + # Get from queue (non-blocking) + audio_chunk = self.audio_queue.get_nowait() + + # Ensure exactly frame_size bytes + if len(audio_chunk) < self.frame_size: + # Pad with silence + audio_chunk += b'\x00' * (self.frame_size - len(audio_chunk)) + elif len(audio_chunk) > self.frame_size: + # Trim excess + audio_chunk = audio_chunk[:self.frame_size] + + return audio_chunk + except: + # No audio available, return silence + return b'\x00' * self.frame_size + + def cleanup(self): + """Clean up resources""" + self.running = False + if self.ws: + asyncio.create_task(self.ws.close()) +``` + +### 4.3 LLM Streaming Integration + +**Llama.cpp Streaming Endpoint:** +``` +POST http://llama-swap-amd:8080/v1/models/llama3.1/completions +Content-Type: application/json + +{ + "prompt": "", + "stream": true, + "temperature": 0.8, + "max_tokens": 500 +} +``` + +**Response (SSE - Server-Sent Events):** +``` +data: {"choices":[{"text":"Hello","finish_reason":null}]} + +data: {"choices":[{"text":" world","finish_reason":null}]} + +data: {"choices":[{"text":"!","finish_reason":"stop"}]} + +data: [DONE] +``` + +**Streaming Handler:** +```python +async def stream_llm_to_tts(prompt, websocket): + """ + Stream LLM tokens directly to TTS WebSocket. 
+ """ + url = f"{globals.LLAMA_AMD_URL}/v1/models/{globals.TEXT_MODEL}/completions" + + payload = { + "prompt": prompt, + "stream": True, + "temperature": 0.8, + "max_tokens": 500, + "stop": ["\n\n", "User:", "Assistant:"] + } + + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload) as resp: + async for line in resp.content: + line = line.decode('utf-8').strip() + + if not line.startswith('data: '): + continue + + data = line[6:] # Remove 'data: ' prefix + + if data == '[DONE]': + # Flush remaining buffer + await websocket.send(json.dumps({"flush": True})) + break + + try: + chunk = json.loads(data) + token = chunk['choices'][0]['text'] + + # Send token to TTS + await websocket.send(json.dumps({ + "token": token, + "pitch_shift": 0 + })) + + except Exception as e: + logger.error(f"Token parse error: {e}") +``` + +### 4.4 Voice Session Implementation + +**Main Session Class:** +```python +class VoiceSession: + """ + Manages a single voice chat session. + """ + def __init__(self, guild_id, voice_channel, text_channel): + self.guild_id = guild_id + self.voice_channel = voice_channel + self.text_channel = text_channel + self.voice_client = None + self.audio_source = None + self.tts_websocket = None + self.active = False + + async def connect(self): + """Connect to voice channel and TTS pipeline""" + # 1. Connect to Discord voice + self.voice_client = await self.voice_channel.connect() + + # 2. Connect to TTS WebSocket + self.tts_websocket = await websockets.connect("ws://localhost:8765/ws/stream") + + # 3. Create audio source + self.audio_source = MikuVoiceSource() + await self.audio_source.connect() + + # 4. Start playing audio stream + self.voice_client.play(self.audio_source) + + self.active = True + logger.info(f"Voice session started in {self.voice_channel.name}") + + async def speak(self, prompt): + """ + Generate speech for given prompt. + Streams LLM tokens β†’ TTS β†’ Discord voice. 
+ """ + if not self.active: + raise Exception("Voice session not active") + + # Build full LLM prompt with context + full_prompt = await self._build_llm_prompt(prompt) + + # Stream tokens to TTS + await stream_llm_to_tts(full_prompt, self.tts_websocket) + + async def _build_llm_prompt(self, user_prompt): + """Build full prompt with context (similar to query_llama)""" + # Get mood and context + from utils.llm import get_context_for_response_type + from server_manager import server_manager + + server_config = server_manager.get_server_config(self.guild_id) + current_mood = server_config.current_mood_description + + miku_context = get_context_for_response_type("server_response") + + # Build messages array + messages = [ + {"role": "system", "content": f"{miku_context}\n\nMiku is currently feeling: {current_mood}"}, + {"role": "user", "content": user_prompt} + ] + + # Convert to llama.cpp prompt format (depends on model) + # For Llama 3.1: + prompt = "<|begin_of_text|>" + for msg in messages: + if msg["role"] == "system": + prompt += f"<|start_header_id|>system<|end_header_id|>\n{msg['content']}<|eot_id|>" + elif msg["role"] == "user": + prompt += f"<|start_header_id|>user<|end_header_id|>\n{msg['content']}<|eot_id|>" + prompt += "<|start_header_id|>assistant<|end_header_id|>\n" + + return prompt + + async def disconnect(self): + """Disconnect from voice and cleanup""" + self.active = False + + # Stop audio playback + if self.voice_client and self.voice_client.is_playing(): + self.voice_client.stop() + + # Disconnect from voice + if self.voice_client: + await self.voice_client.disconnect() + + # Close TTS WebSocket + if self.tts_websocket: + await self.tts_websocket.close() + + # Cleanup audio source + if self.audio_source: + self.audio_source.cleanup() + + logger.info("Voice session ended") +``` + +--- + +## 5. Command Implementation + +### 5.1 Voice Commands + +**New commands to add:** + +1. 
**`!miku join [#voice-channel]`** + - Join specified voice channel (or user's current channel) + - Set text channel as prompt input channel + - Lock resources and start session + +2. **`!miku leave`** + - Leave current voice channel + - Release resources + - Resume normal operation + +3. **`!miku voice-status`** + - Show current voice session status + - Show active prompt channel + - Show resource allocation + +### 5.2 Command Router Integration + +**Add to `bot/command_router.py`:** +```python +from commands.voice import handle_voice_command + +# In route_command(): +if cmd in ['join', 'leave', 'voice-status', 'speak']: + return await handle_voice_command(message, cmd, args) +``` + +**New file `bot/commands/voice.py`:** +```python +from utils.voice_manager import voice_manager + +async def handle_voice_command(message, cmd, args): + """Handle voice-related commands""" + + if cmd == 'join': + # Get voice channel + if args and args[0].startswith('<#'): + # Channel mentioned + channel_id = int(args[0][2:-1]) + voice_channel = message.guild.get_channel(channel_id) + else: + # Use user's current voice channel + if message.author.voice: + voice_channel = message.author.voice.channel + else: + await message.channel.send("❌ You must be in a voice channel!") + return + + try: + await voice_manager.start_session( + message.guild.id, + voice_channel, + message.channel # Use current text channel for prompts + ) + await message.channel.send(f"🎀 Joined {voice_channel.name}! 
Send messages here to make me speak.") + except Exception as e: + await message.channel.send(f"❌ Failed to join voice: {e}") + + elif cmd == 'leave': + if not voice_manager.active_session: + await message.channel.send("❌ I'm not in a voice channel!") + return + + await voice_manager.end_session() + await message.channel.send("πŸ‘‹ Left voice channel!") + + elif cmd == 'voice-status': + if voice_manager.active_session: + session = voice_manager.active_session + await message.channel.send( + f"🎀 **Voice Session Active**\n" + f"Voice Channel: {session.voice_channel.name}\n" + f"Prompt Channel: {session.text_channel.mention}\n" + f"GPU: AMD RX 6800 (text only)\n" + f"Text Channels: Paused (queued)" + ) + else: + await message.channel.send("No active voice session") +``` + +### 5.3 Text Channel Prompt Handler + +**Modify `bot/bot.py` `on_message` handler:** +```python +@globals.client.event +async def on_message(message): + if message.author == globals.client.user: + return + + # Check if this is voice prompt channel + if voice_manager.active_session: + session = voice_manager.active_session + if message.channel.id == session.text_channel.id: + # This is a voice prompt + await session.speak(message.content) + await message.add_reaction('🎀') # Acknowledge + return + + # ... rest of normal message handling ... +``` + +--- + +## 6. Implementation Phases + +### Phase 1: Foundation (3-4 hours) +**Goal:** Set up basic voice connection and resource management + +**Tasks:** +1. Install PyNaCl dependency +2. Add global state variables to `globals.py` +3. Create `bot/utils/voice_manager.py` +4. Implement `VoiceSessionManager` singleton +5. Implement all resource locking methods: + - GPU switching + - Vision model blocking + - Text channel pausing + - Bipolar mode disabling + - Profile picture lock + - Image generation blocking + - Autonomous engine pause + - Scheduled events pause + - Figurine notifier pause +6. Add feature-specific response handlers (image gen, vision model) +7. 
Test voice connection without TTS + +**Deliverables:** +- Voice channel join/leave working +- All resource locks functional +- Text channels properly paused during session +- All features properly disabled/re-enabled around sessions +- Hardcoded responses for blocked features + +### Phase 2: Audio Streaming (3-4 hours) +**Goal:** Implement TTS audio streaming to Discord + +**Tasks:** +1. Create `MikuVoiceSource` class +2. Implement WebSocket β†’ Discord audio bridge +3. Handle audio format conversion (float32 mono β†’ int16 stereo) +4. Implement frame buffering and timing +5. Test with static text (no LLM streaming yet) + +**Deliverables:** +- Audio plays in Discord voice channel +- TTS pipeline outputs correctly formatted audio +- No audio glitches or timing issues + +### Phase 3: LLM Streaming Integration (2-3 hours) +**Goal:** Connect LLM token stream to TTS pipeline + +**Tasks:** +1. Implement `stream_llm_to_tts()` function +2. Handle SSE parsing from llama.cpp +3. Build proper prompt with context/mood +4. Test token-by-token streaming +5. Handle edge cases (connection drops, errors) + +**Deliverables:** +- LLM tokens stream to TTS in real-time +- Audio starts playing quickly (1-2s latency) +- Natural sentence boundaries respected + +### Phase 4: Commands & UX (1-2 hours) +**Goal:** Polish user interface and commands + +**Tasks:** +1. Create `bot/commands/voice.py` +2. Add commands to router +3. Implement status messages +4. Add error handling and user feedback +5. Test edge cases (multiple join attempts, etc.) + +**Deliverables:** +- All voice commands working +- Clear user feedback +- Graceful error handling + +### Phase 5: Testing & Refinement (2-3 hours) +**Goal:** Ensure stability and performance + +**Tasks:** +1. Load testing (long sessions, many prompts) +2. Resource leak detection +3. Audio quality verification +4. Latency optimization +5. 
Documentation and README updates
+
+**Deliverables:**
+- Stable voice sessions (no crashes)
+- Optimal latency (target: <2s first audio)
+- Updated documentation
+
+**Total Estimated Time: 11-16 hours**
+
+---
+
+## 7. Error Handling & Edge Cases
+
+### 7.1 Common Error Scenarios
+
+**1. TTS Pipeline Unavailable**
+- Symptom: Can't connect to WebSocket
+- Response: Return error, don't start voice session
+- Message: "❌ TTS pipeline not available. Check soprano/rvc containers."
+
+**2. Voice Channel Full**
+- Symptom: Can't join voice channel (user limit)
+- Response: Return error
+- Message: "❌ Voice channel is full!"
+
+**3. Already in Voice Session**
+- Symptom: User tries to join while session active
+- Response: Reject command
+- Message: "❌ Already in a voice session! Use `!miku leave` first."
+
+**4. LLM Timeout**
+- Symptom: LLM doesn't respond within timeout
+- Response: Send silence, log error
+- Message: "(in voice) *Miku seems confused...*"
+
+**5. Audio Buffer Underrun**
+- Symptom: TTS slower than playback rate
+- Response: Pad with silence, don't crash
+- Log: Warning about buffer underrun
+
+**6. Blocked Feature Attempted During Voice**
+- Symptom: User tries to generate image, send image, trigger bipolar mode
+- Response: Send appropriate blocked feature message
+- Examples:
+  - "🎀 I can't draw right now, I'm talking in voice chat!"
+  - "🎀 I can't look at images right now, I'm talking in voice chat!"
+- Log: Feature block triggered
+
+**7. Resource Cleanup Failure**
+- Symptom: Feature doesn't resume after voice session
+- Response: Log error, attempt manual cleanup
+- Fallback: Restart bot if critical features stuck
+
+### 7.2 Graceful Degradation
+
+**Priority Order:**
+1. Keep bot online (don't crash)
+2. Maintain voice connection if possible
+3. Inform user of issues
+4. 
Fallback to text if voice fails + +**Fallback Strategy:** +```python +async def speak_with_fallback(session, prompt): + """Speak in voice, fallback to text if error""" + try: + await session.speak(prompt) + except Exception as e: + logger.error(f"Voice speak failed: {e}") + # Fallback: send text response + response = await query_llama(prompt, ...) + await session.text_channel.send(f"⚠️ (Voice failed, text mode): {response}") +``` + +--- + +## 8. Performance Optimization + +### 8.1 Latency Reduction Strategies + +**Target: <2 seconds from prompt to first audio** + +**Optimization Points:** +1. **Pre-warm TTS connection** + - Keep WebSocket connected during session + - Reduce handshake overhead + +2. **Reduce LLM prompt length** + - Limit conversation history to 4 messages + - Truncate long context + +3. **Parallel processing** + - Start TTS as soon as first token arrives + - Don't wait for full sentence + +4. **Buffer tuning** + - Keep audio buffer small (5-10 chunks max) + - Balance latency vs. smoothness + +### 8.2 Resource Monitoring + +**Metrics to Track:** +- VRAM usage (AMD GPU during session) +- CPU usage (RVC/Soprano processing) +- Audio buffer fill level +- LLM token rate (tokens/second) +- End-to-end latency (prompt β†’ audio) + +**Implementation:** +```python +class VoiceMetrics: + def __init__(self): + self.prompt_times = [] + self.first_audio_times = [] + self.total_tokens = 0 + + def log_prompt(self): + self.prompt_times.append(time.time()) + + def log_first_audio(self): + if self.prompt_times: + latency = time.time() - self.prompt_times[-1] + self.first_audio_times.append(latency) + logger.info(f"First audio latency: {latency:.2f}s") +``` + +--- + +## 9. Testing Plan + +### 9.1 Unit Tests + +**Components to Test:** +1. `MikuVoiceSource.read()` - Audio framing +2. `stream_llm_to_tts()` - Token streaming +3. `VoiceSessionManager` - Resource locking +4. Audio format conversion + +### 9.2 Integration Tests + +**Test Scenarios:** +1. 
Full voice session lifecycle (join → speak → leave)
+2. Resource cleanup after session
+3. Text channel pause/resume
+4. Multiple prompts in quick succession
+5. Long prompts (500+ characters)
+6. Error recovery (TTS crash, LLM timeout)
+
+### 9.3 Feature Blocking Tests
+
+**Test each blocked feature during voice session:**
+
+1. **Vision Model Blocking**
+   - ✅ Send image to Miku while in voice
+   - ✅ Verify blocked message appears
+   - ✅ Confirm no vision model loaded (check logs)
+   - ✅ After leaving voice, send image again
+   - ✅ Verify vision model works normally
+
+2. **Image Generation Blocking**
+   - ✅ Try "draw [prompt]" while in voice
+   - ✅ Verify custom blocked message appears
+   - ✅ Confirm ComfyUI not called
+   - ✅ After leaving voice, try draw again
+   - ✅ Verify image generation works normally
+
+3. **Bipolar Mode Blocking**
+   - ✅ Trigger bipolar argument score while in voice
+   - ✅ Verify no argument starts
+   - ✅ Check logs for block message
+   - ✅ After leaving voice, verify bipolar mode resumes
+
+4. **Profile Picture Blocking**
+   - ✅ Trigger profile picture change while in voice
+   - ✅ Verify avatar doesn't change
+   - ✅ After leaving voice, verify pfp switching works
+
+5. **Autonomous Engine Blocking**
+   - ✅ Wait for autonomous message trigger while in voice
+   - ✅ Verify no autonomous messages sent
+   - ✅ After leaving voice, verify autonomous resumes
+
+6. **Scheduled Events Blocking**
+   - ✅ Join voice near scheduled event time
+   - ✅ Verify event doesn't fire during session
+   - ✅ After leaving voice, verify scheduler active
+
+7. 
**Text Channel Queuing** + - βœ… Send regular message while in voice + - βœ… Verify no response during session + - βœ… Verify message queued (check logs) + - βœ… After leaving voice, verify queued messages processed + +### 9.4 Manual Testing Checklist + +- [ ] Join voice channel via command +- [ ] Bot appears in voice channel +- [ ] Send prompt in text channel +- [ ] Audio plays in voice channel within 2s +- [ ] Audio quality is clear (no glitches) +- [ ] Multiple prompts work in sequence +- [ ] Leave command works +- [ ] Resources released after leave +- [ ] Text channels resume normal operation +- [ ] Bot can rejoin after leaving + +--- + +## 10. Future Enhancements (Post-MVP) + +### 10.1 Speech-to-Text (STT) Integration +**Goal:** Allow users to speak to Miku instead of typing + +**Approach:** +- Use Whisper model for STT +- Run on AMD GPU during voice sessions +- Stream audio β†’ text β†’ LLM β†’ TTS β†’ audio + +### 10.2 Multi-User Voice Conversations +**Goal:** Multiple users can take turns speaking + +**Approach:** +- Voice activity detection (VAD) +- Queue speaker turns +- Name prefixes in prompts ("User1: ...", "User2: ...") + +### 10.3 Background Music/Sound Effects +**Goal:** Play background music while Miku speaks + +**Approach:** +- Mix audio streams (voice + music) +- Volume ducking (lower music during speech) +- FFmpeg audio processing + +### 10.4 Voice Commands +**Goal:** Control bot via voice ("Miku, leave voice chat") + +**Approach:** +- Simple keyword detection in STT output +- Command routing from voice input + +### 10.5 Emotion-Aware Speech +**Goal:** Vary TTS pitch/speed based on mood + +**Approach:** +- Map mood β†’ pitch_shift parameter +- Dynamic pitch based on emotion detection + +--- + +## 11. 
Configuration & Deployment + +### 11.1 Environment Variables + +**Add to `docker-compose.yml`:** +```yaml +miku-bot: + environment: + - VOICE_ENABLED=true + - TTS_WEBSOCKET_URL=ws://miku-rvc-api:8765/ws/stream + - VOICE_GPU=amd # Force AMD GPU during voice sessions +``` + +### 11.2 Network Configuration + +**Ensure containers can communicate:** +- `miku-bot` β†’ `miku-soprano-tts` (ZMQ 5555) +- `miku-bot` β†’ `miku-rvc-api` (HTTP/WS 8765) +- `miku-bot` β†’ `llama-swap-amd` (HTTP 8080) + +**Add to docker-compose networks if needed:** +```yaml +networks: + miku-network: + name: miku-network + driver: bridge +``` + +### 11.3 Dependencies + +**Add to `bot/requirements.txt`:** +``` +PyNaCl>=1.5.0 # Voice support +websockets>=12.0 # TTS WebSocket client +``` + +### 11.4 Global State Variables + +**Add to `bot/globals.py`:** +```python +# Voice Chat Session State +VOICE_SESSION_ACTIVE = False +TEXT_MESSAGE_QUEUE = [] # Queue for messages received during voice session + +# Feature Blocking Flags (set during voice session) +VISION_MODEL_BLOCKED = False +IMAGE_GENERATION_BLOCKED = False +IMAGE_GENERATION_BLOCK_MESSAGE = None +``` + +--- + +## 12. Risk Assessment + +### 12.1 Technical Risks + +**High Risk:** +- **Audio glitches/stuttering** - Mitigation: Extensive buffer testing +- **Resource exhaustion** - Mitigation: Strict resource locking +- **TTS pipeline crashes** - Mitigation: Health checks, auto-restart + +**Medium Risk:** +- **High latency** - Mitigation: Optimization, parallel processing +- **Connection drops** - Mitigation: Retry logic, graceful degradation + +**Low Risk:** +- **Command conflicts** - Mitigation: Clear command names +- **User confusion** - Mitigation: Status messages, documentation + +### 12.2 Resource Risks + +**Concern:** AMD GPU overload (RVC + LLM simultaneously) + +**Mitigation:** +1. Monitor VRAM usage during testing +2. Reduce RVC batch size if needed +3. Consider limiting LLM context length +4. 
Add VRAM threshold checks
+
+**Concern:** Text channel message queue overflow
+
+**Mitigation:**
+1. Limit queue size (e.g., 100 messages)
+2. Discard oldest messages if limit reached
+3. Send warning to users
+
+---
+
+## 13. Documentation Requirements
+
+### 13.1 User Documentation
+
+**Create `VOICE_CHAT_USER_GUIDE.md`:**
+- How to invite Miku to voice channel
+- How to send prompts
+- Troubleshooting common issues
+- Feature limitations
+
+### 13.2 Developer Documentation
+
+**Create `VOICE_CHAT_DEVELOPER_GUIDE.md`:**
+- Architecture overview
+- Code organization
+- Adding new voice features
+- Debugging tips
+
+### 13.3 API Documentation
+
+**Document in `API_REFERENCE.md`:**
+- Voice command endpoints
+- VoiceSessionManager API
+- MikuVoiceSource interface
+
+---
+
+## 14. Success Criteria
+
+### 14.1 Functional Requirements ✓
+- [x] Miku can join voice channel
+- [x] Miku can speak using TTS pipeline
+- [x] LLM tokens stream in real-time
+- [x] Text prompts trigger voice responses
+- [x] Resource management prevents conflicts
+- [x] Graceful session cleanup
+
+### 14.2 Resource Management Requirements
+- [ ] GPU switches to AMD during session
+- [ ] Vision model blocked during session
+- [ ] Text channels paused (messages queued)
+- [ ] Bipolar mode interactions disabled
+- [ ] Profile picture switching locked
+- [ ] Image generation blocked with custom message
+- [ ] Autonomous engine paused
+- [ ] Scheduled events paused
+- [ ] Figurine notifier paused
+- [ ] All features resume after session ends
+
+### 14.3 Performance Requirements
+- [ ] First audio within 2 seconds of prompt
+- [ ] Audio quality: Clear, no glitches
+- [ ] VRAM usage: <14GB on AMD GPU
+- [ ] Sessions stable for 30+ minutes
+
+### 14.4 Usability Requirements
+- [ ] Commands intuitive and documented
+- [ ] Error messages clear and actionable
+- [ ] Status indicators show session state
+- [ ] Fallback to text on voice failure
+- [ ] Helpful blocked feature messages
+
+---
+
+## 15. 
Next Steps
+
+### Immediate Actions:
+1. **Review this plan** with team/stakeholders
+2. **Set up development branch** (`feature/voice-chat`)
+3. **Install dependencies** (PyNaCl, test WebSocket connectivity)
+4. **Create skeleton files** (voice_manager.py, voice.py commands)
+5. **Start Phase 1 implementation**
+
+### Before Starting Implementation:
+1. Verify soprano/rvc containers are healthy
+2. Test WebSocket endpoint manually (`websocket_client_example.py`)
+3. Verify AMD GPU has sufficient VRAM (check with `rocm-smi`)
+4. Back up current bot state (in case rollback needed)
+
+### Development Workflow:
+1. Create feature branch
+2. Implement phase-by-phase
+3. Test each phase before moving to next
+4. Document changes in commit messages
+5. Merge to main when MVP complete
+
+---
+
+## Appendix A: File Structure
+
+```
+miku-discord/
+├── bot/
+│   ├── commands/
+│   │   ├── actions.py
+│   │   └── voice.py # NEW: Voice commands
+│   ├── utils/
+│   │   ├── llm.py # MODIFY: Add streaming support
+│   │   ├── voice_manager.py # NEW: Session management
+│   │   └── voice_stream.py # NEW: Audio streaming classes
+│   ├── bot.py # MODIFY: Add voice prompt handling
+│   └── command_router.py # MODIFY: Add voice command routing
+├── soprano_to_rvc/
+│   ├── soprano_rvc_api.py # EXISTING: WebSocket endpoint
+│   └── docker-compose.yml # EXISTING: TTS containers
+├── docker-compose.yml # MODIFY: Add environment vars
+└── readmes/
+    ├── VOICE_CHAT_IMPLEMENTATION_PLAN.md # THIS FILE
+    ├── VOICE_CHAT_USER_GUIDE.md # NEW: User docs
+    └── VOICE_CHAT_DEVELOPER_GUIDE.md # NEW: Dev docs
+```
+
+---
+
+## Appendix B: Key Code Snippets
+
+### B.1 Streaming LLM Tokens
+```python
+async def stream_llm_tokens(prompt):
+    """Stream tokens from llama-swap-amd"""
+    url = f"{globals.LLAMA_AMD_URL}/v1/models/{globals.TEXT_MODEL}/completions"
+    async with aiohttp.ClientSession() as session:
+        async with 
session.post(url, json={"prompt": prompt, "stream": True}) as resp: + async for line in resp.content: + if line.startswith(b'data: '): + data = json.loads(line[6:]) + if 'choices' in data: + yield data['choices'][0]['text'] +``` + +### B.2 Discord Voice Connection +```python +voice_client = await voice_channel.connect() +audio_source = MikuVoiceSource() +await audio_source.connect() +voice_client.play(audio_source) +``` + +### B.3 Resource Lock Pattern +```python +async with voice_manager.session_lock: + await switch_to_amd_gpu() + await block_vision_model() + await pause_text_channels() + # ... start session ... +``` + +--- + +## Appendix C: Troubleshooting Guide + +### Issue: "TTS pipeline not available" +**Cause:** RVC container not running or WebSocket unreachable +**Fix:** +```bash +cd soprano_to_rvc +docker-compose up -d +docker-compose logs rvc +``` + +### Issue: Audio stuttering/glitching +**Cause:** Buffer underrun (TTS too slow) +**Fix:** Increase audio buffer size in `MikuVoiceSource` + +### Issue: High latency (>5s first audio) +**Cause:** LLM slow to generate tokens +**Fix:** Reduce prompt length, check GPU utilization + +### Issue: Voice session hangs on disconnect +**Cause:** Resource cleanup timeout +**Fix:** Add timeout to disconnect operations + +### Issue: Features not resuming after voice session +**Cause:** Resource unlock methods not called or failed +**Fix:** +```bash +# Check logs for cleanup errors +docker logs miku-bot | grep -i "voice\|resume\|enable" + +# Manual fix: Restart bot to reset all states +docker restart miku-bot +``` + +### Issue: Image generation still works during voice session +**Cause:** Block check not implemented in image gen handler +**Fix:** Add `if globals.IMAGE_GENERATION_BLOCKED` check in `commands/actions.py` + +### Issue: Bipolar argument triggered during voice +**Cause:** Block check missing in bipolar_mode.py +**Fix:** Add `if voice_manager.active_session` check before argument triggers + +--- + +## Appendix D: 
Quick Reference - Resource Blocks + +**Developer Quick Reference: What to Check Before Each Feature** + +| Feature/Module | Check This Before Running | Global Flag | +|----------------|---------------------------|-------------| +| Vision model loading | `globals.VISION_MODEL_BLOCKED` | `VISION_MODEL_BLOCKED` | +| Image generation | `globals.IMAGE_GENERATION_BLOCKED` | `IMAGE_GENERATION_BLOCKED` | +| Bipolar triggers | `voice_manager.active_session` | N/A (check object) | +| Profile picture | `profile_picture_manager.switching_locked` | N/A (check object) | +| Autonomous msgs | `_autonomous_paused` (in autonomous.py) | N/A (module-level) | +| Scheduled events | N/A (scheduler.pause() called) | N/A (APScheduler) | +| Text channel response | `globals.VOICE_SESSION_ACTIVE` | `VOICE_SESSION_ACTIVE` | + +**Locations to Add Checks:** +- `bot/bot.py` - Main message handler (vision, image gen, text response) +- `bot/utils/bipolar_mode.py` - Argument trigger functions +- `bot/utils/profile_picture_manager.py` - Update functions +- `bot/utils/autonomous.py` - Message generation functions +- `bot/commands/actions.py` - Image generation handler + +--- + +## Conclusion + +This implementation plan provides a comprehensive roadmap for adding voice channel functionality to Miku. The phased approach ensures incremental progress with testing at each stage. The resource management strategy carefully balances competing demands on limited hardware. + +**Key Success Factors:** +1. Strict resource locking prevents conflicts (8 features disabled during voice) +2. Token streaming minimizes latency (<2s target) +3. Graceful error handling ensures stability +4. Clear user feedback improves experience (blocked feature messages) +5. 
Comprehensive testing covers all edge cases + +**Critical Implementation Points:** +- ⚠️ **Must disable 8 features** during voice: vision, image gen, bipolar, pfp, autonomous, scheduled events, figurine notifier, text channels +- ⚠️ **GPU switching mandatory**: AMD RX 6800 for text, GTX 1660 for TTS only +- ⚠️ **User messaging important**: Clear feedback when features blocked +- ⚠️ **Cleanup critical**: All features must resume properly after session + +**Estimated Timeline:** 12-18 hours for MVP, additional 5-10 hours for polish and testing. + +**Files to Create:** +- `bot/utils/voice_manager.py` (main session management) +- `bot/utils/voice_stream.py` (audio streaming classes) +- `bot/commands/voice.py` (voice commands) + +**Files to Modify:** +- `bot/globals.py` (add voice state flags) +- `bot/bot.py` (add voice prompt handler, blocking checks) +- `bot/command_router.py` (add voice command routing) +- `bot/utils/bipolar_mode.py` (add session checks) +- `bot/utils/profile_picture_manager.py` (add locking) +- `bot/utils/autonomous.py` (add pause/resume) +- `bot/commands/actions.py` (add image gen blocking) +- `bot/requirements.txt` (add PyNaCl, websockets) + +**Ready to proceed?** Review this plan, make any necessary adjustments, and begin Phase 1 implementation.