Phase 3 implemented — Text LLM can now stream to the TTS pipeline with the !miku say command

commit 3e59e5d2f6
parent 9943cecdec
Date: 2026-01-17 00:01:17 +02:00
3 changed files with 123 additions and 2 deletions
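What this wires up: in a guild text channel, !miku say <message> now routes through handle_voice_command to a new _handle_say, which streams a chat completion from the OpenAI-compatible endpoint at get_current_gpu_url(), forwards each delta token to the active voice session's TTS WebSocket via send_token(), and finally calls flush() so the buffered tail of the reply is synthesized. An illustrative session (Miku's reply text here is made up):

    !miku join
    !miku say what are you up to today?
    💭 Processing: "what are you up to today?"
    🎤 Miku: "Just warming up my voice! Want me to sing something?"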


@@ -125,7 +125,7 @@ async def on_message(message):
     if message.author == globals.client.user:
         return
 
-    # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test)
+    # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test, !miku say)
     if not isinstance(message.channel, discord.DMChannel) and message.content.strip().lower().startswith('!miku '):
         from commands.voice import handle_voice_command
 
@@ -134,7 +134,7 @@ async def on_message(message):
         cmd = parts[1].lower()
         args = parts[2:] if len(parts) > 2 else []
 
-        if cmd in ['join', 'leave', 'voice-status', 'test']:
+        if cmd in ['join', 'leave', 'voice-status', 'test', 'say']:
             await handle_voice_command(message, cmd, args)
             return

View File

@@ -5,8 +5,11 @@ Handles joining, leaving, and status commands for voice chat sessions.
 """
 import discord
+import aiohttp
+import json
 from utils.voice_manager import voice_manager
 from utils.logger import get_logger
+from utils.llm import get_current_gpu_url
 
 logger = get_logger('voice_commands')
 
@@ -33,6 +36,9 @@ async def handle_voice_command(message, cmd, args):
     elif cmd == 'test':
         await _handle_test(message, args)
+    elif cmd == 'say':
+        await _handle_say(message, args)
     else:
         await message.channel.send(f"❌ Unknown voice command: `{cmd}`")
 
@@ -263,3 +269,105 @@ async def _handle_test(message, args):
     except Exception as e:
         logger.error(f"Failed to test voice playback: {e}", exc_info=True)
         await message.channel.send(f"❌ Error testing voice: {e}")
+
+async def _handle_say(message, args):
+    """
+    Handle !miku say command.
+    Send the user's message to the LLM and speak the response in voice chat.
+    Phase 3: Text → LLM → Voice (STT deferred to a later phase)
+    """
+    # Validate args
+    if not args:
+        await message.channel.send("❌ Usage: `!miku say <your message>`")
+        return
+
+    # Check for an active voice session
+    session = voice_manager.active_session
+    if not session:
+        await message.channel.send("❌ No active voice session! Use `!miku join` first.")
+        return
+
+    if not session.audio_source:
+        await message.channel.send("❌ Audio source not connected!")
+        return
+
+    # Extract user message
+    user_message = " ".join(args)
+
+    try:
+        # Show processing indicator
+        await message.channel.send(f"💭 Processing: *\"{user_message}\"*")
+        logger.info(f"Voice say: user={message.author.name}, message={user_message}")
+
+        # Prepare LLM payload (based on query_llama logic)
+        import globals
+
+        # Simple system prompt for voice responses
+        system_prompt = """You are Hatsune Miku, the virtual singer.
+Respond naturally and concisely as Miku would in a voice conversation.
+Keep responses short (1-3 sentences) since they will be spoken aloud."""
+
+        payload = {
+            "model": globals.TEXT_MODEL,
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_message}
+            ],
+            "stream": True,
+            "temperature": 0.8,
+            "max_tokens": 200  # Shorter for voice
+        }
+        headers = {'Content-Type': 'application/json'}
+        llama_url = get_current_gpu_url()
+        logger.info(f"Streaming LLM from {llama_url}")
+
+        # Stream the LLM response and send tokens to TTS
+        async with aiohttp.ClientSession() as http_session:
+            async with http_session.post(
+                f"{llama_url}/v1/chat/completions",
+                json=payload,
+                headers=headers,
+                timeout=aiohttp.ClientTimeout(total=60)
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise Exception(f"LLM error {response.status}: {error_text}")
+
+                # Process the streaming response (one SSE line per chunk)
+                full_response = ""
+                async for line in response.content:
+                    line = line.decode('utf-8').strip()
+                    if line.startswith('data: '):
+                        data_str = line[6:]  # Remove 'data: ' prefix
+                        if data_str == '[DONE]':
+                            break
+                        try:
+                            data = json.loads(data_str)
+                            if 'choices' in data and len(data['choices']) > 0:
+                                delta = data['choices'][0].get('delta', {})
+                                content = delta.get('content', '')
+                                if content:
+                                    # Send token to TTS
+                                    await session.audio_source.send_token(content)
+                                    full_response += content
+                        except json.JSONDecodeError:
+                            continue
+
+        # Send flush command to trigger synthesis of any remaining tokens
+        await session.audio_source.flush()
+
+        # Show what Miku said
+        await message.channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*")
+        logger.info(f"✓ Voice say complete: {full_response.strip()}")
+        await message.add_reaction("✅")
+
+    except Exception as e:
+        logger.error(f"Voice say failed: {e}", exc_info=True)
+        await message.channel.send(f"❌ Voice say failed: {str(e)}")
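For context, the parser above consumes the OpenAI-compatible server-sent-events stream that /v1/chat/completions emits when "stream": True is set: each event arrives as a single data: line whose JSON chunk carries the next token in choices[0].delta.content, and the stream ends with a literal [DONE] sentinel. Simplified to only the fields this handler reads (real chunks also carry id, model, and finish_reason):

    data: {"choices":[{"delta":{"content":"Hi"}}]}
    data: {"choices":[{"delta":{"content":" there!"}}]}
    data: [DONE]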


@@ -282,6 +282,19 @@ class MikuVoiceSource(discord.AudioSource):
         # Small delay to avoid overwhelming the TTS
         await asyncio.sleep(0.05)
 
+    async def flush(self):
+        """
+        Send flush command to TTS to trigger synthesis of buffered tokens.
+        This ensures any remaining text in the TTS buffer is synthesized.
+        """
+        if self.websocket:
+            try:
+                await self.websocket.send_json({"flush": True})
+                logger.debug("Sent flush command to TTS")
+            except Exception as e:
+                logger.error(f"Failed to send flush command: {e}")
+
     async def _receive_audio(self):
         """Background task to receive audio from WebSocket and buffer it."""