Phase 3 implemented — Text LLM can now stream to the TTS pipeline with the !miku say command

commit 3e59e5d2f6
parent 9943cecdec
Date: 2026-01-17 00:01:17 +02:00
3 changed files with 123 additions and 2 deletions
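What this wires up: in a guild text channel, !miku say <message> now routes through handle_voice_command to a new _handle_say, which streams a chat completion from the OpenAI-compatible endpoint at get_current_gpu_url(), forwards each delta token to the active voice session's TTS WebSocket via send_token(), and finally calls flush() so the buffered tail of the reply is synthesized. An illustrative session (Miku's reply text here is made up):

    !miku join
    !miku say what are you up to today?
    💭 Processing: "what are you up to today?"
    🎤 Miku: "Just warming up my voice! Want me to sing something?"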


@@ -125,7 +125,7 @@ async def on_message(message):
     if message.author == globals.client.user:
         return
 
-    # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test)
+    # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test, !miku say)
     if not isinstance(message.channel, discord.DMChannel) and message.content.strip().lower().startswith('!miku '):
         from commands.voice import handle_voice_command
 
@@ -134,7 +134,7 @@ async def on_message(message):
         cmd = parts[1].lower()
         args = parts[2:] if len(parts) > 2 else []
 
-        if cmd in ['join', 'leave', 'voice-status', 'test']:
+        if cmd in ['join', 'leave', 'voice-status', 'test', 'say']:
             await handle_voice_command(message, cmd, args)
             return

View File

@@ -5,8 +5,11 @@ Handles joining, leaving, and status commands for voice chat sessions.
 """
 import discord
+import aiohttp
+import json
 from utils.voice_manager import voice_manager
 from utils.logger import get_logger
+from utils.llm import get_current_gpu_url
 
 logger = get_logger('voice_commands')
 
@@ -33,6 +36,9 @@ async def handle_voice_command(message, cmd, args):
     elif cmd == 'test':
         await _handle_test(message, args)
+    elif cmd == 'say':
+        await _handle_say(message, args)
     else:
         await message.channel.send(f"❌ Unknown voice command: `{cmd}`")
 
@@ -263,3 +269,105 @@ async def _handle_test(message, args):
     except Exception as e:
         logger.error(f"Failed to test voice playback: {e}", exc_info=True)
         await message.channel.send(f"❌ Error testing voice: {e}")
+
+async def _handle_say(message, args):
+    """
+    Handle !miku say command.
+    Send the user's message to the LLM and speak the response in voice chat.
+    Phase 3: Text → LLM → Voice (STT deferred to a later phase)
+    """
+    # Validate args
+    if not args:
+        await message.channel.send("❌ Usage: `!miku say <your message>`")
+        return
+
+    # Check for an active voice session
+    session = voice_manager.active_session
+    if not session:
+        await message.channel.send("❌ No active voice session! Use `!miku join` first.")
+        return
+
+    if not session.audio_source:
+        await message.channel.send("❌ Audio source not connected!")
+        return
+
+    # Extract user message
+    user_message = " ".join(args)
+
+    try:
+        # Show processing indicator
+        await message.channel.send(f"💭 Processing: *\"{user_message}\"*")
+        logger.info(f"Voice say: user={message.author.name}, message={user_message}")
+
+        # Prepare LLM payload (based on query_llama logic)
+        import globals
+
+        # Simple system prompt for voice responses
+        system_prompt = """You are Hatsune Miku, the virtual singer.
+Respond naturally and concisely as Miku would in a voice conversation.
+Keep responses short (1-3 sentences) since they will be spoken aloud."""
+
+        payload = {
+            "model": globals.TEXT_MODEL,
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_message}
+            ],
+            "stream": True,
+            "temperature": 0.8,
+            "max_tokens": 200  # Shorter for voice
+        }
+        headers = {'Content-Type': 'application/json'}
+        llama_url = get_current_gpu_url()
+        logger.info(f"Streaming LLM from {llama_url}")
+
+        # Stream the LLM response and send tokens to TTS
+        async with aiohttp.ClientSession() as http_session:
+            async with http_session.post(
+                f"{llama_url}/v1/chat/completions",
+                json=payload,
+                headers=headers,
+                timeout=aiohttp.ClientTimeout(total=60)
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise Exception(f"LLM error {response.status}: {error_text}")
+
+                # Process the streaming response (one SSE line per chunk)
+                full_response = ""
+                async for line in response.content:
+                    line = line.decode('utf-8').strip()
+                    if line.startswith('data: '):
+                        data_str = line[6:]  # Remove 'data: ' prefix
+                        if data_str == '[DONE]':
+                            break
+                        try:
+                            data = json.loads(data_str)
+                            if 'choices' in data and len(data['choices']) > 0:
+                                delta = data['choices'][0].get('delta', {})
+                                content = delta.get('content', '')
+                                if content:
+                                    # Send token to TTS
+                                    await session.audio_source.send_token(content)
+                                    full_response += content
+                        except json.JSONDecodeError:
+                            continue
+
+        # Send flush command to trigger synthesis of any remaining tokens
+        await session.audio_source.flush()
+
+        # Show what Miku said
+        await message.channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*")
+        logger.info(f"✓ Voice say complete: {full_response.strip()}")
+        await message.add_reaction("✅")
+
+    except Exception as e:
+        logger.error(f"Voice say failed: {e}", exc_info=True)
+        await message.channel.send(f"❌ Voice say failed: {str(e)}")
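For context, the parser above consumes the OpenAI-compatible server-sent-events stream that /v1/chat/completions emits when "stream": True is set: each event arrives as a single data: line whose JSON chunk carries the next token in choices[0].delta.content, and the stream ends with a literal [DONE] sentinel. Simplified to only the fields this handler reads (real chunks also carry id, model, and finish_reason):

    data: {"choices":[{"delta":{"content":"Hi"}}]}
    data: {"choices":[{"delta":{"content":" there!"}}]}
    data: [DONE]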


@@ -282,6 +282,19 @@ class MikuVoiceSource(discord.AudioSource):
         # Small delay to avoid overwhelming the TTS
         await asyncio.sleep(0.05)
 
+    async def flush(self):
+        """
+        Send flush command to TTS to trigger synthesis of buffered tokens.
+        This ensures any remaining text in the TTS buffer is synthesized.
+        """
+        if self.websocket:
+            try:
+                await self.websocket.send_json({"flush": True})
+                logger.debug("Sent flush command to TTS")
+            except Exception as e:
+                logger.error(f"Failed to send flush command: {e}")
+
     async def _receive_audio(self):
         """Background task to receive audio from WebSocket and buffer it."""