Phase 3 implemented: the text LLM can now stream to the TTS pipeline via the !miku say command
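Summary of the flow: !miku say <message> sends the text to the OpenAI-compatible /v1/chat/completions endpoint returned by get_current_gpu_url() with streaming enabled, forwards each streamed token to the active session's audio source via send_token(), and finally calls the new flush() so the TTS synthesizes whatever is still buffered. A typical exchange in a guild channel (Miku's reply text is illustrative):

!miku join
!miku say hello miku, how was your day?
💭 Processing: "hello miku, how was your day?"
🎤 Miku: "..."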
@@ -125,7 +125,7 @@ async def on_message(message):
     if message.author == globals.client.user:
         return
 
-    # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test)
+    # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test, !miku say)
     if not isinstance(message.channel, discord.DMChannel) and message.content.strip().lower().startswith('!miku '):
         from commands.voice import handle_voice_command
 
@@ -134,7 +134,7 @@ async def on_message(message):
         cmd = parts[1].lower()
         args = parts[2:] if len(parts) > 2 else []
 
-        if cmd in ['join', 'leave', 'voice-status', 'test']:
+        if cmd in ['join', 'leave', 'voice-status', 'test', 'say']:
             await handle_voice_command(message, cmd, args)
             return
 

@@ -5,8 +5,11 @@ Handles joining, leaving, and status commands for voice chat sessions.
 """
 
 import discord
+import aiohttp
+import json
 from utils.voice_manager import voice_manager
 from utils.logger import get_logger
+from utils.llm import get_current_gpu_url
 
 logger = get_logger('voice_commands')
 
@@ -33,6 +36,9 @@ async def handle_voice_command(message, cmd, args):
     elif cmd == 'test':
         await _handle_test(message, args)
 
+    elif cmd == 'say':
+        await _handle_say(message, args)
+
     else:
         await message.channel.send(f"❌ Unknown voice command: `{cmd}`")
 
@@ -263,3 +269,105 @@ async def _handle_test(message, args):
     except Exception as e:
         logger.error(f"Failed to test voice playback: {e}", exc_info=True)
         await message.channel.send(f"❌ Error testing voice: {e}")
+
+
+async def _handle_say(message, args):
+    """
+    Handle !miku say command.
+    Send user message to LLM and speak the response in voice chat.
+
+    Phase 3: Text → LLM → Voice (STT deferred to later phase)
+    """
+    # Validate args
+    if not args:
+        await message.channel.send("❌ Usage: `!miku say <your message>`")
+        return
+
+    # Check active voice session
+    session = voice_manager.active_session
+    if not session:
+        await message.channel.send("❌ No active voice session! Use `!miku join` first.")
+        return
+
+    if not session.audio_source:
+        await message.channel.send("❌ Audio source not connected!")
+        return
+
+    # Extract user message
+    user_message = " ".join(args)
+
+    try:
+        # Show processing indicator
+        await message.channel.send(f"💭 Processing: *\"{user_message}\"*")
+        logger.info(f"Voice say: user={message.author.name}, message={user_message}")
+
+        # Prepare LLM payload (based on query_llama logic)
+        from utils.llm import get_current_gpu_url
+        import globals
+
+        # Simple system prompt for voice responses
+        system_prompt = """You are Hatsune Miku, the virtual singer.
+Respond naturally and concisely as Miku would in a voice conversation.
+Keep responses short (1-3 sentences) since they will be spoken aloud."""
+
+        payload = {
+            "model": globals.TEXT_MODEL,
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_message}
+            ],
+            "stream": True,
+            "temperature": 0.8,
+            "max_tokens": 200  # Shorter for voice
+        }
+
+        headers = {'Content-Type': 'application/json'}
+        llama_url = get_current_gpu_url()
+
+        logger.info(f"Streaming LLM from {llama_url}")
+
+        # Stream LLM response and send tokens to TTS
+        async with aiohttp.ClientSession() as http_session:
+            async with http_session.post(
+                f"{llama_url}/v1/chat/completions",
+                json=payload,
+                headers=headers,
+                timeout=aiohttp.ClientTimeout(total=60)
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise Exception(f"LLM error {response.status}: {error_text}")
+
+                # Process streaming response
+                full_response = ""
+                async for line in response.content:
+                    line = line.decode('utf-8').strip()
+                    if line.startswith('data: '):
+                        data_str = line[6:]  # Remove 'data: ' prefix
+                        if data_str == '[DONE]':
+                            break
+
+                        try:
+                            data = json.loads(data_str)
+                            if 'choices' in data and len(data['choices']) > 0:
+                                delta = data['choices'][0].get('delta', {})
+                                content = delta.get('content', '')
+                                if content:
+                                    # Send token to TTS
+                                    await session.audio_source.send_token(content)
+                                    full_response += content
+                        except json.JSONDecodeError:
+                            continue
+
+        # Send flush command to trigger synthesis of remaining tokens
+        await session.audio_source.flush()
+
+        # Show what Miku said
+        await message.channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*")
+        logger.info(f"✓ Voice say complete: {full_response.strip()}")
+        await message.add_reaction("✅")
+
+    except Exception as e:
+        logger.error(f"Voice say failed: {e}", exc_info=True)
+        await message.channel.send(f"❌ Voice say failed: {str(e)}")
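
For reference, the streaming loop above consumes OpenAI-style server-sent events, which llama.cpp-compatible servers emit as one JSON chunk per data: line, terminated by a [DONE] sentinel. A stream looks roughly like this (fields abridged):

data: {"choices":[{"index":0,"delta":{"content":"Hi"}}]}
data: {"choices":[{"index":0,"delta":{"content":" there!"}}]}
data: [DONE]

This is why the parser strips the data: prefix, breaks on [DONE], and silently skips lines that fail json.loads (keep-alive or partial lines).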

@@ -281,8 +281,21 @@ class MikuVoiceSource(discord.AudioSource):
             await self.send_token(word + " ", pitch_shift)
             # Small delay to avoid overwhelming the TTS
             await asyncio.sleep(0.05)
 
+    async def flush(self):
+        """
+        Send flush command to TTS to trigger synthesis of buffered tokens.
+        This ensures any remaining text in the TTS buffer is synthesized.
+        """
+        if self.websocket:
+            try:
+                await self.websocket.send_json({"flush": True})
+                logger.debug("Sent flush command to TTS")
+            except Exception as e:
+                logger.error(f"Failed to send flush command: {e}")
+
+
 
     async def _receive_audio(self):
         """Background task to receive audio from WebSocket and buffer it."""
         try:
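
The TTS server side of the flush protocol is not part of this commit. Purely as an illustration, a receiving loop that honors the {"flush": true} message might look like the sketch below; the handler shape, the "token" message format, and the synthesize() helper are assumptions, not this project's actual code:

# Hypothetical TTS-side WebSocket handler, illustrative only (not from this repo)
from aiohttp import web

def synthesize(text: str) -> bytes:
    # Stand-in for a real TTS engine call; would return raw audio bytes
    raise NotImplementedError

async def tts_socket(request):
    ws = web.WebSocketResponse()
    await ws.prepare(request)
    buffer = ""
    async for msg in ws:
        data = msg.json()
        if data.get("flush"):
            # Flush: synthesize whatever text is still buffered, then reset
            if buffer.strip():
                await ws.send_bytes(synthesize(buffer))
            buffer = ""
        else:
            buffer += data.get("token", "")  # "token" field name is assumed
    return ws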