Implemented an experimental, production-ready voice chat flow and relegated the old flow to voice debug mode. Added a new Web UI panel for Voice Chat.

This commit is contained in:
2026-01-20 23:06:17 +02:00
parent 362108f4b0
commit 2934efba22
31 changed files with 5408 additions and 357 deletions

View File

@@ -6,6 +6,7 @@ Uses aiohttp for WebSocket communication (compatible with FastAPI).
import asyncio
import json
import re
import numpy as np
from typing import Optional
import discord
@@ -29,6 +30,25 @@ CHANNELS = 2 # Stereo for Discord
FRAME_LENGTH = 0.02 # 20ms frames
SAMPLES_PER_FRAME = int(SAMPLE_RATE * FRAME_LENGTH) # 960 samples
# Emoji pattern for filtering text before it is handed to TTS.
# Every entry is either a dedicated symbol/pictograph block or an individual
# emoji code point. The ranges are deliberately narrow: a catch-all such as
# U+24C2..U+1F251 also spans Hiragana, Katakana, CJK ideographs, Hangul and
# fullwidth forms, so it would silently delete ordinary Japanese/Chinese/
# Korean text instead of just emojis.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U00002600-\U000027BF"  # miscellaneous symbols + dingbats
    "\U000024C2"             # circled M
    "\U000025AA-\U000025AB"  # small squares
    "\U000025B6\U000025C0"   # play / reverse buttons
    "\U000025FB-\U000025FE"  # medium squares
    "\U00002934-\U00002935"  # curved arrows
    "\U00002B05-\U00002B07"  # left/up/down arrows
    "\U00002B1B-\U00002B1C"  # large squares
    "\U00002B50\U00002B55"   # star, heavy circle
    "\U00003030\U0000303D"   # wavy dash, part alternation mark
    "\U00003297\U00003299"   # circled ideograph congratulation / secret
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)
class MikuVoiceSource(discord.AudioSource):
"""
@@ -38,8 +58,9 @@ class MikuVoiceSource(discord.AudioSource):
"""
def __init__(self):
self.websocket_url = "ws://172.25.0.1:8765/ws/stream"
self.health_url = "http://172.25.0.1:8765/health"
# Use Docker hostname for RVC service (miku-rvc-api is on miku-voice-network)
self.websocket_url = "ws://miku-rvc-api:8765/ws/stream"
self.health_url = "http://miku-rvc-api:8765/health"
self.session = None
self.websocket = None
self.audio_buffer = bytearray()
@@ -230,11 +251,26 @@ class MikuVoiceSource(discord.AudioSource):
"""
Send a text token to TTS for voice generation.
Queues tokens if pipeline is still warming up or connection failed.
Filters out emojis to prevent TTS hallucinations.
Args:
token: Text token to synthesize
pitch_shift: Pitch adjustment (-12 to +12 semitones)
"""
# Filter out emojis from the token (preserve whitespace!)
original_token = token
token = EMOJI_PATTERN.sub('', token)
# If token is now empty or only whitespace after emoji removal, skip it
if not token or not token.strip():
if original_token != token:
logger.debug(f"Skipped token (only emojis): '{original_token}'")
return
# Log if we filtered out emojis
if original_token != token:
logger.debug(f"Filtered emojis from token: '{original_token}' -> '{token}'")
# If not warmed up yet or no connection, queue the token
if not self.warmed_up or not self.websocket:
self.token_queue.append((token, pitch_shift))