Implemented an experimental, production-ready voice chat and relegated the old flow to voice debug mode. Added a new Web UI panel for Voice Chat.
This commit is contained in:
@@ -398,6 +398,13 @@ class VoiceSession:
|
||||
# Voice chat conversation history (last 8 exchanges)
|
||||
self.conversation_history = [] # List of {"role": "user"/"assistant", "content": str}
|
||||
|
||||
# Voice call management (for automated calls from web UI)
|
||||
self.call_user_id: Optional[int] = None # User ID that was called
|
||||
self.call_timeout_task: Optional[asyncio.Task] = None # 30min timeout task
|
||||
self.user_has_joined = False # Track if user joined the call
|
||||
self.auto_leave_task: Optional[asyncio.Task] = None # 45s auto-leave task
|
||||
self.user_leave_time: Optional[float] = None # When user left the channel
|
||||
|
||||
logger.info(f"VoiceSession created for {voice_channel.name} in guild {guild_id}")
|
||||
|
||||
async def start_audio_streaming(self):
|
||||
@@ -488,6 +495,57 @@ class VoiceSession:
|
||||
self.voice_receiver = None
|
||||
logger.info("✓ Stopped all listening")
|
||||
|
||||
async def on_user_join(self, user_id: int):
    """Handle a member joining the voice channel.

    Only acts when this session has a call user set and the joining member
    is that user: marks them as joined and cancels any pending call-timeout
    or auto-leave task.

    Args:
        user_id: ID of the member who joined.
    """
    expected_user = self.call_user_id
    if not expected_user or user_id != expected_user:
        # Not a voice call, or somebody other than the called user joined.
        return

    self.user_has_joined = True
    logger.info(f"✓ Call user {user_id} joined the channel")

    # The called user is here, so the call timeout is no longer needed.
    pending_timeout = self.call_timeout_task
    if pending_timeout:
        pending_timeout.cancel()
        self.call_timeout_task = None

    # A rejoin also aborts any auto-leave countdown that was running.
    pending_leave = self.auto_leave_task
    if pending_leave:
        pending_leave.cancel()
        self.auto_leave_task = None
        self.user_leave_time = None
|
||||
|
||||
async def on_user_leave(self, user_id: int):
    """Handle a member leaving the voice channel.

    When the called user leaves after having joined, records the time they
    left and starts the 45s auto-leave countdown. Any other member leaving,
    or the called user leaving before ever joining, is ignored.

    Args:
        user_id: ID of the member who left.
    """
    is_call_user = self.call_user_id and user_id == self.call_user_id
    if not (is_call_user and self.user_has_joined):
        return

    import time

    self.user_leave_time = time.time()
    logger.info(f"📴 Call user {user_id} left - starting 45s auto-leave timer")

    # If a countdown is already pending (e.g. a duplicate leave event),
    # cancel it before replacing the reference. Otherwise the old task is
    # orphaned: on_user_join can only cancel the task stored on self, so the
    # stale timer would still fire and end the session after a rejoin.
    stale_task = self.auto_leave_task
    if stale_task and not stale_task.done():
        stale_task.cancel()

    # Start (or restart) the 45s auto-leave timer.
    self.auto_leave_task = asyncio.create_task(self._auto_leave_after_user_disconnect())
|
||||
|
||||
async def _auto_leave_after_user_disconnect(self):
    """Leave the voice channel 45s after the called user disconnects.

    Sleeps for 45 seconds, then ends the voice session via
    ``VoiceSessionManager().end_session()`` and stops the voice containers
    via ``ContainerManager.stop_voice_containers()``.

    Cancelling the task (as ``on_user_join`` does when the user comes back)
    aborts the countdown; the cancellation is logged and absorbed here.
    Any other failure is logged instead of being left on the task object,
    where nothing would ever retrieve it.
    """
    try:
        await asyncio.sleep(45)

        logger.info("⏰ 45s timeout reached - auto-leaving voice channel")

        # End the session (will trigger cleanup).
        # Imported locally, as in the original (presumably to avoid a
        # circular import - confirm).
        from utils.voice_manager import VoiceSessionManager
        session_manager = VoiceSessionManager()
        await session_manager.end_session()

        # Stop containers
        from utils.container_manager import ContainerManager
        await ContainerManager.stop_voice_containers()

        logger.info("✓ Auto-leave complete")

    except asyncio.CancelledError:
        # User rejoined, normal operation
        logger.info("Auto-leave cancelled - user rejoined")
    except Exception:
        # This runs as a fire-and-forget task whose reference is kept on the
        # session, so an unhandled error would never be reported. Log it
        # with the traceback at this task boundary.
        logger.exception("Auto-leave failed while ending the voice session")
|
||||
|
||||
async def on_user_vad_event(self, user_id: int, event: dict):
|
||||
"""Called when VAD detects speech state change."""
|
||||
event_type = event.get('event')
|
||||
@@ -515,7 +573,10 @@ class VoiceSession:
|
||||
# Get user info for notification
|
||||
user = self.voice_channel.guild.get_member(user_id)
|
||||
user_name = user.name if user else f"User {user_id}"
|
||||
await self.text_channel.send(f"💬 *{user_name} said: \"{text}\" (interrupted but too brief - talk longer to interrupt)*")
|
||||
|
||||
# Only send message if debug mode is on
|
||||
if globals.VOICE_DEBUG_MODE:
|
||||
await self.text_channel.send(f"💬 *{user_name} said: \"{text}\" (interrupted but too brief - talk longer to interrupt)*")
|
||||
return
|
||||
|
||||
logger.info(f"✓ Processing final transcript (miku_speaking={self.miku_speaking})")
|
||||
@@ -530,12 +591,14 @@ class VoiceSession:
|
||||
stop_phrases = ["stop talking", "be quiet", "shut up", "stop speaking", "silence"]
|
||||
if any(phrase in text.lower() for phrase in stop_phrases):
|
||||
logger.info(f"🤫 Stop command detected: {text}")
|
||||
await self.text_channel.send(f"🎤 {user.name}: *\"{text}\"*")
|
||||
await self.text_channel.send(f"🤫 *Miku goes quiet*")
|
||||
if globals.VOICE_DEBUG_MODE:
|
||||
await self.text_channel.send(f"🎤 {user.name}: *\"{text}\"*")
|
||||
await self.text_channel.send(f"🤫 *Miku goes quiet*")
|
||||
return
|
||||
|
||||
# Show what user said
|
||||
await self.text_channel.send(f"🎤 {user.name}: *\"{text}\"*")
|
||||
# Show what user said (only in debug mode)
|
||||
if globals.VOICE_DEBUG_MODE:
|
||||
await self.text_channel.send(f"🎤 {user.name}: *\"{text}\"*")
|
||||
|
||||
# Generate LLM response and speak it
|
||||
await self._generate_voice_response(user, text)
|
||||
@@ -582,14 +645,15 @@ class VoiceSession:
|
||||
logger.info(f"⏸️ Pausing for {self.interruption_silence_duration}s after interruption")
|
||||
await asyncio.sleep(self.interruption_silence_duration)
|
||||
|
||||
# 5. Add interruption marker to conversation history
|
||||
# Add interruption marker to conversation history
|
||||
self.conversation_history.append({
|
||||
"role": "assistant",
|
||||
"content": "[INTERRUPTED - user started speaking]"
|
||||
})
|
||||
|
||||
# Show interruption in chat
|
||||
await self.text_channel.send(f"⚠️ *{user_name} interrupted Miku*")
|
||||
# Show interruption in chat (only in debug mode)
|
||||
if globals.VOICE_DEBUG_MODE:
|
||||
await self.text_channel.send(f"⚠️ *{user_name} interrupted Miku*")
|
||||
|
||||
logger.info(f"✓ Interruption handled, ready for next input")
|
||||
|
||||
@@ -599,8 +663,10 @@ class VoiceSession:
|
||||
Called when VAD-based interruption detection is used.
|
||||
"""
|
||||
await self.on_user_interruption(user_id)
|
||||
user = self.voice_channel.guild.get_member(user_id)
|
||||
await self.text_channel.send(f"⚠️ *{user.name if user else 'User'} interrupted Miku*")
|
||||
# Only show interruption message in debug mode
|
||||
if globals.VOICE_DEBUG_MODE:
|
||||
user = self.voice_channel.guild.get_member(user_id)
|
||||
await self.text_channel.send(f"⚠️ *{user.name if user else 'User'} interrupted Miku*")
|
||||
|
||||
async def _generate_voice_response(self, user: discord.User, text: str):
|
||||
"""
|
||||
@@ -624,13 +690,13 @@ class VoiceSession:
|
||||
self.miku_speaking = True
|
||||
logger.info(f" → miku_speaking is now: {self.miku_speaking}")
|
||||
|
||||
# Show processing
|
||||
await self.text_channel.send(f"💭 *Miku is thinking...*")
|
||||
# Show processing (only in debug mode)
|
||||
if globals.VOICE_DEBUG_MODE:
|
||||
await self.text_channel.send(f"💭 *Miku is thinking...*")
|
||||
|
||||
# Import here to avoid circular imports
|
||||
from utils.llm import get_current_gpu_url
|
||||
import aiohttp
|
||||
import globals
|
||||
|
||||
# Load personality and lore
|
||||
miku_lore = ""
|
||||
@@ -657,8 +723,11 @@ VOICE CHAT CONTEXT:
|
||||
* Stories/explanations: 4-6 sentences when asked for details
|
||||
- Match the user's energy and conversation style
|
||||
- IMPORTANT: Only respond in ENGLISH! The TTS system cannot handle Japanese or other languages well.
|
||||
- IMPORTANT: Do not include emojis in your response! The TTS system cannot handle them well.
|
||||
- IMPORTANT: Do NOT prefix your response with your name (like "Miku:" or "Hatsune Miku:")! Just speak naturally - you're already known to be speaking.
|
||||
- Be expressive and use casual language, but stay in character as Miku
|
||||
- If user says "stop talking" or "be quiet", acknowledge briefly and stop
|
||||
- NOTE: You will automatically disconnect 45 seconds after {user.name} leaves the voice channel, so you can mention this if asked about leaving
|
||||
|
||||
Remember: This is a live voice conversation - be natural, not formulaic!"""
|
||||
|
||||
@@ -742,15 +811,19 @@ Remember: This is a live voice conversation - be natural, not formulaic!"""
|
||||
if self.miku_speaking:
|
||||
await self.audio_source.flush()
|
||||
|
||||
# Add Miku's complete response to history
|
||||
# Filter out self-referential prefixes from response
|
||||
filtered_response = self._filter_name_prefixes(full_response.strip())
|
||||
|
||||
# Add Miku's complete response to history (use filtered version)
|
||||
self.conversation_history.append({
|
||||
"role": "assistant",
|
||||
"content": full_response.strip()
|
||||
"content": filtered_response
|
||||
})
|
||||
|
||||
# Show response
|
||||
await self.text_channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*")
|
||||
logger.info(f"✓ Voice response complete: {full_response.strip()}")
|
||||
# Show response (only in debug mode)
|
||||
if globals.VOICE_DEBUG_MODE:
|
||||
await self.text_channel.send(f"🎤 Miku: *\"{filtered_response}\"*")
|
||||
logger.info(f"✓ Voice response complete: {filtered_response}")
|
||||
else:
|
||||
# Interrupted - don't add incomplete response to history
|
||||
# (interruption marker already added by on_user_interruption)
|
||||
@@ -763,6 +836,35 @@ Remember: This is a live voice conversation - be natural, not formulaic!"""
|
||||
finally:
|
||||
self.miku_speaking = False
|
||||
|
||||
def _filter_name_prefixes(self, text: str) -> str:
|
||||
"""
|
||||
Filter out self-referential name prefixes from Miku's responses.
|
||||
|
||||
Removes patterns like:
|
||||
- "Miku: rest of text"
|
||||
- "Hatsune Miku: rest of text"
|
||||
- "miku: rest of text" (case insensitive)
|
||||
|
||||
Args:
|
||||
text: Raw response text
|
||||
|
||||
Returns:
|
||||
Filtered text without name prefixes
|
||||
"""
|
||||
import re
|
||||
|
||||
# Pattern matches "Miku:" or "Hatsune Miku:" at the start of the text (case insensitive)
|
||||
# Captures any amount of whitespace after the colon
|
||||
pattern = r'^(?:Hatsune\s+)?Miku:\s*'
|
||||
|
||||
filtered = re.sub(pattern, '', text, flags=re.IGNORECASE)
|
||||
|
||||
# Log if we filtered something
|
||||
if filtered != text:
|
||||
logger.info(f"Filtered name prefix: '{text[:30]}...' -> '{filtered[:30]}...'")
|
||||
|
||||
return filtered
|
||||
|
||||
async def _cancel_tts(self):
|
||||
"""
|
||||
Immediately cancel TTS synthesis and clear all audio buffers.
|
||||
|
||||
Reference in New Issue
Block a user