Phase 3 implemented — Text LLM can now stream to the TTS pipeline with the !miku say command
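Once a voice session is active, usage looks like this (a sketch inferred from the handler's usage and error strings below):

    !miku join
    !miku say Hello Miku, how was your day?

Each streamed token from the text LLM is forwarded to the TTS audio source via send_token(), with a final flush() once the stream ends.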
@@ -5,8 +5,11 @@ Handles joining, leaving, and status commands for voice chat sessions.
"""

import discord
import aiohttp
import json
from utils.voice_manager import voice_manager
from utils.logger import get_logger
from utils.llm import get_current_gpu_url

logger = get_logger('voice_commands')

@@ -33,6 +36,9 @@ async def handle_voice_command(message, cmd, args):
    elif cmd == 'test':
        await _handle_test(message, args)

    elif cmd == 'say':
        await _handle_say(message, args)

    else:
        await message.channel.send(f"❌ Unknown voice command: `{cmd}`")

@@ -263,3 +269,105 @@ async def _handle_test(message, args):
    except Exception as e:
        logger.error(f"Failed to test voice playback: {e}", exc_info=True)
        await message.channel.send(f"❌ Error testing voice: {e}")


async def _handle_say(message, args):
    """
    Handle the !miku say command.
    Send the user's message to the LLM and speak the response in voice chat.

    Phase 3: Text → LLM → Voice (STT deferred to a later phase)
    """
    # Validate args
    if not args:
        await message.channel.send("❌ Usage: `!miku say <your message>`")
        return

    # Check active voice session
    session = voice_manager.active_session
    if not session:
        await message.channel.send("❌ No active voice session! Use `!miku join` first.")
        return

    if not session.audio_source:
        await message.channel.send("❌ Audio source not connected!")
        return

    # Extract user message
    user_message = " ".join(args)

    try:
        # Show processing indicator
        await message.channel.send(f"💭 Processing: *\"{user_message}\"*")
        logger.info(f"Voice say: user={message.author.name}, message={user_message}")

        # Prepare LLM payload (based on query_llama logic)
        from utils.llm import get_current_gpu_url
        import globals

        # Simple system prompt for voice responses
        system_prompt = """You are Hatsune Miku, the virtual singer.
Respond naturally and concisely as Miku would in a voice conversation.
Keep responses short (1-3 sentences) since they will be spoken aloud."""

        payload = {
            "model": globals.TEXT_MODEL,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            "stream": True,
            "temperature": 0.8,
            "max_tokens": 200  # Shorter for voice
        }

        headers = {'Content-Type': 'application/json'}
        llama_url = get_current_gpu_url()

        logger.info(f"Streaming LLM from {llama_url}")

        # Stream LLM response and send tokens to TTS
        async with aiohttp.ClientSession() as http_session:
            async with http_session.post(
                f"{llama_url}/v1/chat/completions",
                json=payload,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=60)
            ) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"LLM error {response.status}: {error_text}")

                # Process streaming response
                full_response = ""
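                # Each streamed line is a server-sent event in the
                # OpenAI-compatible chat-completions format, e.g.:
                #   data: {"choices": [{"delta": {"content": "Hi"}}]}
                #   data: [DONE]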
                async for line in response.content:
                    line = line.decode('utf-8').strip()
                    if line.startswith('data: '):
                        data_str = line[6:]  # Remove 'data: ' prefix
                        if data_str == '[DONE]':
                            break

                        try:
                            data = json.loads(data_str)
                            if 'choices' in data and len(data['choices']) > 0:
                                delta = data['choices'][0].get('delta', {})
                                content = delta.get('content', '')
                                if content:
                                    # Send token to TTS
                                    await session.audio_source.send_token(content)
                                    full_response += content
                        except json.JSONDecodeError:
                            continue

        # Send flush command to trigger synthesis of remaining tokens
        await session.audio_source.flush()
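        # (Assumption: the audio source buffers tokens into sentence-sized
        # chunks for synthesis; flush() forces the remainder to be spoken.)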

        # Show what Miku said
        await message.channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*")
        logger.info(f"✓ Voice say complete: {full_response.strip()}")
        await message.add_reaction("✅")

    except Exception as e:
        logger.error(f"Voice say failed: {e}", exc_info=True)
        await message.channel.send(f"❌ Voice say failed: {str(e)}")
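
To exercise the LLM half of this pipeline in isolation, here is a minimal standalone sketch of the same streaming loop with the Discord and TTS pieces stubbed out (the base URL and model name are placeholders, not values from this repo):

import asyncio
import json

import aiohttp


async def stream_chat(base_url: str, model: str, user_message: str) -> str:
    """Stream one chat completion and print tokens as they arrive."""
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": user_message}],
        "stream": True,
        "max_tokens": 200,
    }
    full_response = ""
    async with aiohttp.ClientSession() as http_session:
        async with http_session.post(
            f"{base_url}/v1/chat/completions",
            json=payload,
            timeout=aiohttp.ClientTimeout(total=60),
        ) as response:
            response.raise_for_status()
            async for raw in response.content:
                line = raw.decode("utf-8").strip()
                if not line.startswith("data: "):
                    continue
                data_str = line[6:]
                if data_str == "[DONE]":
                    break
                try:
                    delta = json.loads(data_str)["choices"][0].get("delta", {})
                except (json.JSONDecodeError, KeyError, IndexError):
                    continue
                token = delta.get("content", "")
                # The bot would call session.audio_source.send_token(token) here
                print(token, end="", flush=True)
                full_response += token
    return full_response


# asyncio.run(stream_chat("http://localhost:8080", "some-model", "Hello Miku!"))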