Implemented an experimental, production-ready voice chat flow and relegated the old flow to voice debug mode. Added a new Web UI panel for Voice Chat.
This commit is contained in:
@@ -6,6 +6,7 @@ Uses aiohttp for WebSocket communication (compatible with FastAPI).
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import numpy as np
|
||||
from typing import Optional
|
||||
import discord
|
||||
@@ -29,6 +30,25 @@ CHANNELS = 2 # Stereo for Discord
|
||||
FRAME_LENGTH = 0.02 # 20ms frames
|
||||
SAMPLES_PER_FRAME = int(SAMPLE_RATE * FRAME_LENGTH) # 960 samples
|
||||
|
||||
# Emoji pattern for filtering
|
||||
# Covers most emoji ranges including emoticons, symbols, pictographs, etc.
|
||||
# Compiled character class matching runs of emoji so they can be stripped from
# text before it is sent to TTS.
#
# NOTE: every range is listed explicitly. A single catch-all span such as
# U+24C2..U+1F251 also contains Hiragana, Katakana, CJK ideographs, Hangul and
# fullwidth forms, so substituting it away silently deletes Japanese, Chinese
# and Korean text instead of just emoji.
EMOJI_PATTERN = re.compile(
    "["
    "\U000024C2"             # circled latin capital M
    "\U000025A0-\U000025FF"  # geometric shapes (squares, triangles, ...)
    "\U00002600-\U000027BF"  # miscellaneous symbols + dingbats
    "\U00002934-\U00002935"  # curved arrows
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, ...)
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block emoji only
    "\U0000FE00-\U0000FE0F"  # variation selectors left behind by emoji
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F1FF"  # enclosed alphanumerics + regional flags
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "]+",
    flags=re.UNICODE,
)
|
||||
|
||||
|
||||
class MikuVoiceSource(discord.AudioSource):
|
||||
"""
|
||||
@@ -38,8 +58,9 @@ class MikuVoiceSource(discord.AudioSource):
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.websocket_url = "ws://172.25.0.1:8765/ws/stream"
|
||||
self.health_url = "http://172.25.0.1:8765/health"
|
||||
# Use Docker hostname for RVC service (miku-rvc-api is on miku-voice-network)
|
||||
self.websocket_url = "ws://miku-rvc-api:8765/ws/stream"
|
||||
self.health_url = "http://miku-rvc-api:8765/health"
|
||||
self.session = None
|
||||
self.websocket = None
|
||||
self.audio_buffer = bytearray()
|
||||
@@ -230,11 +251,26 @@ class MikuVoiceSource(discord.AudioSource):
|
||||
"""
|
||||
Send a text token to TTS for voice generation.
|
||||
Queues tokens if pipeline is still warming up or connection failed.
|
||||
Filters out emojis to prevent TTS hallucinations.
|
||||
|
||||
Args:
|
||||
token: Text token to synthesize
|
||||
pitch_shift: Pitch adjustment (-12 to +12 semitones)
|
||||
"""
|
||||
# Filter out emojis from the token (preserve whitespace!)
|
||||
original_token = token
|
||||
token = EMOJI_PATTERN.sub('', token)
|
||||
|
||||
# If token is now empty or only whitespace after emoji removal, skip it
|
||||
if not token or not token.strip():
|
||||
if original_token != token:
|
||||
logger.debug(f"Skipped token (only emojis): '{original_token}'")
|
||||
return
|
||||
|
||||
# Log if we filtered out emojis
|
||||
if original_token != token:
|
||||
logger.debug(f"Filtered emojis from token: '{original_token}' -> '{token}'")
|
||||
|
||||
# If not warmed up yet or no connection, queue the token
|
||||
if not self.warmed_up or not self.websocket:
|
||||
self.token_queue.append((token, pitch_shift))
|
||||
|
||||
Reference in New Issue
Block a user