Implemented an experimental, production-ready voice chat flow and relegated the old flow to voice debug mode. Added a new Web UI panel for Voice Chat.

2026-01-20 23:06:17 +02:00
parent 362108f4b0
commit 2934efba22
31 changed files with 5408 additions and 357 deletions
--- a/bot/utils/voice_audio.py
+++ b/bot/utils/voice_audio.py
@@ -6,6 +6,7 @@ Uses aiohttp for WebSocket communication (compatible with FastAPI).

 import asyncio
 import json
+import re
 import numpy as np
 from typing import Optional
 import discord
@@ -29,6 +30,25 @@ CHANNELS = 2  # Stereo for Discord
 FRAME_LENGTH = 0.02  # 20ms frames
 SAMPLES_PER_FRAME = int(SAMPLE_RATE * FRAME_LENGTH)  # 960 samples

+# Emoji pattern for filtering (emojis are stripped from tokens before they reach TTS).
+# Ranges stay narrow on purpose: a single U+24C2-U+1F251 span would also match kana,
+# CJK ideographs, Hangul and fullwidth forms, erasing every non-Latin token.
+EMOJI_PATTERN = re.compile(
+    "["
+    "\U0001F600-\U0001F64F"  # emoticons
+    "\U0001F300-\U0001F5FF"  # symbols & pictographs
+    "\U0001F680-\U0001F6FF"  # transport & map symbols
+    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
+    "\U0001F900-\U0001FAFF"  # supplemental symbols, chess symbols, extended-A
+    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
+    "\U000024C2\U000025AA-\U000025FE\U00002934\U00002935\U00002B05-\U00002B55"  # circled M, shapes, arrows, stars
+    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
+    "\U0001F004\U0001F0CF\U0001F170-\U0001F19A\U0001F200-\U0001F251"  # mahjong, joker, enclosed characters
+    "\U0000FE0F"  # variation selector-16 left behind once its emoji is removed
+    "]+",
+    flags=re.UNICODE
+)
+

 class MikuVoiceSource(discord.AudioSource):
    """
@@ -38,8 +58,9 @@ class MikuVoiceSource(discord.AudioSource):
    """
    
    def __init__(self):
-        self.websocket_url = "ws://172.25.0.1:8765/ws/stream"
-        self.health_url = "http://172.25.0.1:8765/health"
+        # Use Docker hostname for RVC service (miku-rvc-api is on miku-voice-network)
+        self.websocket_url = "ws://miku-rvc-api:8765/ws/stream"
+        self.health_url = "http://miku-rvc-api:8765/health"
        self.session = None
        self.websocket = None
        self.audio_buffer = bytearray()
@@ -230,11 +251,26 @@ class MikuVoiceSource(discord.AudioSource):
        """
        Send a text token to TTS for voice generation.
        Queues tokens if pipeline is still warming up or connection failed.
+        Filters out emojis to prevent TTS hallucinations.
        
        Args:
            token: Text token to synthesize
            pitch_shift: Pitch adjustment (-12 to +12 semitones)
        """
+        # Filter out emojis from the token (preserve whitespace!)
+        original_token = token
+        token = EMOJI_PATTERN.sub('', token)
+        
+        # If token is now empty or only whitespace after emoji removal, skip it
+        if not token or not token.strip():
+            if original_token != token:
+                logger.debug(f"Skipped token (only emojis): '{original_token}'")
+            return
+        
+        # Log if we filtered out emojis
+        if original_token != token:
+            logger.debug(f"Filtered emojis from token: '{original_token}' -> '{token}'")
+        
        # If not warmed up yet or no connection, queue the token
        if not self.warmed_up or not self.websocket:
            self.token_queue.append((token, pitch_shift))