Phase 3 implemented — Text LLM can now stream to the TTS pipeline with the !miku say command
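Once a voice session is active, usage looks like this (a sketch inferred from the handler's usage and error strings below):

    !miku join
    !miku say Hello Miku, how was your day?

Each streamed token from the text LLM is forwarded to the TTS audio source via send_token(), with a final flush() once the stream ends.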
@@ -5,8 +5,11 @@ Handles joining, leaving, and status commands for voice chat sessions.
"""

import discord
import aiohttp
import json
from utils.voice_manager import voice_manager
from utils.logger import get_logger
from utils.llm import get_current_gpu_url

logger = get_logger('voice_commands')

@@ -33,6 +36,9 @@ async def handle_voice_command(message, cmd, args):
    elif cmd == 'test':
        await _handle_test(message, args)

    elif cmd == 'say':
        await _handle_say(message, args)

    else:
        await message.channel.send(f"❌ Unknown voice command: `{cmd}`")

@@ -263,3 +269,105 @@ async def _handle_test(message, args):
    except Exception as e:
        logger.error(f"Failed to test voice playback: {e}", exc_info=True)
        await message.channel.send(f"❌ Error testing voice: {e}")


async def _handle_say(message, args):
    """
    Handle the !miku say command.
    Send the user's message to the LLM and speak the response in voice chat.

    Phase 3: Text → LLM → Voice (STT deferred to a later phase)
    """
    # Validate args
    if not args:
        await message.channel.send("❌ Usage: `!miku say <your message>`")
        return

    # Check active voice session
    session = voice_manager.active_session
    if not session:
        await message.channel.send("❌ No active voice session! Use `!miku join` first.")
        return

    if not session.audio_source:
        await message.channel.send("❌ Audio source not connected!")
        return

    # Extract user message
    user_message = " ".join(args)

    try:
        # Show processing indicator
        await message.channel.send(f"💭 Processing: *\"{user_message}\"*")
        logger.info(f"Voice say: user={message.author.name}, message={user_message}")

        # Prepare LLM payload (based on query_llama logic)
        from utils.llm import get_current_gpu_url
        import globals

        # Simple system prompt for voice responses
        system_prompt = """You are Hatsune Miku, the virtual singer.
Respond naturally and concisely as Miku would in a voice conversation.
Keep responses short (1-3 sentences) since they will be spoken aloud."""

        payload = {
            "model": globals.TEXT_MODEL,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            "stream": True,
            "temperature": 0.8,
            "max_tokens": 200  # Shorter for voice
        }

        headers = {'Content-Type': 'application/json'}
        llama_url = get_current_gpu_url()

        logger.info(f"Streaming LLM from {llama_url}")

        # Stream LLM response and send tokens to TTS
        async with aiohttp.ClientSession() as http_session:
            async with http_session.post(
                f"{llama_url}/v1/chat/completions",
                json=payload,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=60)
            ) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"LLM error {response.status}: {error_text}")

                # Process streaming response
                full_response = ""
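                # Each streamed line is a server-sent event in the
                # OpenAI-compatible chat-completions format, e.g.:
                #   data: {"choices": [{"delta": {"content": "Hi"}}]}
                #   data: [DONE]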
                async for line in response.content:
                    line = line.decode('utf-8').strip()
                    if line.startswith('data: '):
                        data_str = line[6:]  # Remove 'data: ' prefix
                        if data_str == '[DONE]':
                            break

                        try:
                            data = json.loads(data_str)
                            if 'choices' in data and len(data['choices']) > 0:
                                delta = data['choices'][0].get('delta', {})
                                content = delta.get('content', '')
                                if content:
                                    # Send token to TTS
                                    await session.audio_source.send_token(content)
                                    full_response += content
                        except json.JSONDecodeError:
                            continue

        # Send flush command to trigger synthesis of remaining tokens
        await session.audio_source.flush()
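        # (Assumption: the audio source buffers tokens into sentence-sized
        # chunks for synthesis; flush() forces the remainder to be spoken.)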

        # Show what Miku said
        await message.channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*")
        logger.info(f"✓ Voice say complete: {full_response.strip()}")
        await message.add_reaction("✅")

    except Exception as e:
        logger.error(f"Voice say failed: {e}", exc_info=True)
        await message.channel.send(f"❌ Voice say failed: {str(e)}")
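
To exercise the LLM half of this pipeline in isolation, here is a minimal standalone sketch of the same streaming loop with the Discord and TTS pieces stubbed out (the base URL and model name are placeholders, not values from this repo):

import asyncio
import json

import aiohttp


async def stream_chat(base_url: str, model: str, user_message: str) -> str:
    """Stream one chat completion and print tokens as they arrive."""
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": user_message}],
        "stream": True,
        "max_tokens": 200,
    }
    full_response = ""
    async with aiohttp.ClientSession() as http_session:
        async with http_session.post(
            f"{base_url}/v1/chat/completions",
            json=payload,
            timeout=aiohttp.ClientTimeout(total=60),
        ) as response:
            response.raise_for_status()
            async for raw in response.content:
                line = raw.decode("utf-8").strip()
                if not line.startswith("data: "):
                    continue
                data_str = line[6:]
                if data_str == "[DONE]":
                    break
                try:
                    delta = json.loads(data_str)["choices"][0].get("delta", {})
                except (json.JSONDecodeError, KeyError, IndexError):
                    continue
                token = delta.get("content", "")
                # The bot would call session.audio_source.send_token(token) here
                print(token, end="", flush=True)
                full_response += token
    return full_response


# asyncio.run(stream_chat("http://localhost:8080", "some-model", "Hello Miku!"))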