300 lines
10 KiB
Python
300 lines
10 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Soprano TTS to Virtual Sink
|
||
|
|
This script takes text input and streams Soprano TTS output to a virtual PulseAudio sink
|
||
|
|
that can be used as input for RVC realtime voice conversion.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import sys
|
||
|
|
import os
|
||
|
|
import subprocess
|
||
|
|
import signal
|
||
|
|
import sounddevice as sd
|
||
|
|
import numpy as np
|
||
|
|
import torch
|
||
|
|
from scipy import signal as scipy_signal
|
||
|
|
|
||
|
|
# Add soprano to path
|
||
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano'))
|
||
|
|
from soprano import SopranoTTS
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Name given to the PulseAudio null sink; its monitor source is
# "<name>.monitor".
VIRTUAL_SINK_NAME = "soprano_to_rvc"

# Rate (Hz) of the virtual sink and of the output stream. 48 kHz is used for
# better compatibility with audio systems.
SAMPLE_RATE = 48_000

# Rate (Hz) at which Soprano emits audio.
SOPRANO_RATE = 32_000

# Number of output channels; stereo to match RVC expectations.
CHANNELS = 2

# Global shutdown flag: set to False by the SIGINT handler so loops can stop
# gracefully.
running = True
|
||
|
|
|
||
|
|
|
||
|
|
def signal_handler(sig, frame):
    """Request a graceful shutdown in response to a signal (Ctrl+C).

    Prints a notice and clears the module-level ``running`` flag; it does
    not raise, so code that polls ``running`` decides when to stop.

    Args:
        sig: Signal number passed by the ``signal`` machinery (unused).
        frame: Interrupted stack frame (unused).
    """
    print("\n\nShutting down gracefully...")
    # Rebind the module-level flag that the loops poll.
    globals()["running"] = False
|
||
|
|
|
||
|
|
|
||
|
|
def create_virtual_sink():
    """Ensure the PulseAudio null sink ``VIRTUAL_SINK_NAME`` exists.

    First lists the existing sinks with ``pactl``; if one is named exactly
    ``VIRTUAL_SINK_NAME`` nothing is created. Otherwise a ``module-null-sink``
    is loaded at ``SAMPLE_RATE`` Hz with ``CHANNELS`` channels.

    Returns:
        bool: True if the sink already existed or was created, False if
        ``pactl`` is not installed or loading the module failed.
    """
    # Check if sink already exists
    try:
        listing = subprocess.run(
            ["pactl", "list", "sinks", "short"],
            capture_output=True,
            text=True,
            check=True,
        )
    except FileNotFoundError:
        # Without pactl nothing below can work; report instead of crashing.
        print("✗ Failed to create virtual sink: 'pactl' command not found")
        return False
    except subprocess.CalledProcessError:
        # Listing failed; fall through and attempt creation anyway.
        pass
    else:
        # Short listing is tab-separated with the sink name in the second
        # column. Compare that column exactly so a sink that merely contains
        # our name (e.g. "<name>_2") is not mistaken for ours.
        sink_names = set()
        for row in listing.stdout.splitlines():
            columns = row.split("\t")
            if len(columns) > 1:
                sink_names.add(columns[1].strip())
        if VIRTUAL_SINK_NAME in sink_names:
            print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' already exists")
            print(f" Monitor source: {VIRTUAL_SINK_NAME}.monitor")
            return True

    print(f"Creating virtual sink: {VIRTUAL_SINK_NAME}")
    try:
        # Create a null sink (virtual audio device) at SAMPLE_RATE for
        # compatibility; channel count follows CHANNELS.
        subprocess.run([
            "pactl", "load-module", "module-null-sink",
            f"sink_name={VIRTUAL_SINK_NAME}",
            f"sink_properties=device.description={VIRTUAL_SINK_NAME}",
            f"rate={SAMPLE_RATE}",
            f"channels={CHANNELS}",
        ], check=True, capture_output=True)
    except FileNotFoundError:
        print("✗ Failed to create virtual sink: 'pactl' command not found")
        return False
    except subprocess.CalledProcessError as e:
        # stderr is bytes here (no text=True); never let decoding raise.
        print(f"✗ Failed to create virtual sink: {e.stderr.decode(errors='replace')}")
        return False

    print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' created successfully")
    print(f" Monitor source: {VIRTUAL_SINK_NAME}.monitor")
    return True
|
||
|
|
|
||
|
|
|
||
|
|
def remove_virtual_sink():
    """Unload the ``module-null-sink`` that backs ``VIRTUAL_SINK_NAME``.

    Best-effort cleanup: the first matching module is unloaded; if none is
    found nothing happens. Failures running ``pactl`` are printed, never
    raised, so this is safe to call from ``finally`` blocks.
    """
    print(f"\nRemoving virtual sink: {VIRTUAL_SINK_NAME}")
    wanted_arg = f"sink_name={VIRTUAL_SINK_NAME}"
    try:
        # Find the module ID
        listing = subprocess.run(
            ["pactl", "list", "modules", "short"],
            capture_output=True,
            text=True,
            check=True,
        )
        for row in listing.stdout.splitlines():
            # Short listing: "<id> <module name> <arguments...>"
            fields = row.split(None, 2)
            if len(fields) < 3 or fields[1] != "module-null-sink":
                # Other modules (e.g. a loopback reading "<name>.monitor")
                # may mention our sink name in their arguments; skip them.
                continue
            # Tolerate optional quoting around the argument value.
            args = [
                tok.replace('"', "").replace("'", "")
                for tok in fields[2].split()
            ]
            if wanted_arg not in args:
                continue
            subprocess.run(["pactl", "unload-module", fields[0]], check=True)
            print("✓ Virtual sink removed")
            return
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing pactl binary; SubprocessError covers a
        # non-zero exit from either pactl invocation.
        print(f"✗ Error removing virtual sink: {e}")
|
||
|
|
|
||
|
|
|
||
|
|
def get_virtual_sink_device_id():
    """Get the sounddevice ID for our virtual sink.

    Returns:
        int | None: Index of the first device whose name contains
        ``VIRTUAL_SINK_NAME`` and that offers at least one output channel,
        or None when no such device is listed.
    """
    # Force refresh device list.
    # NOTE: these are private sounddevice functions and may change between
    # library versions.
    sd._terminate()
    sd._initialize()

    for index, device in enumerate(sd.query_devices()):
        if VIRTUAL_SINK_NAME not in device['name']:
            continue
        # An input-only entry (for instance one exposing the sink's monitor)
        # can carry the same name but cannot be opened for playback.
        if device['max_output_channels'] > 0:
            return index
    return None
|
||
|
|
|
||
|
|
|
||
|
|
def _locate_sink_device():
    """Return the virtual sink's device index, recreating the sink once if missing."""
    device_id = get_virtual_sink_device_id()
    if device_id is not None:
        return device_id

    print(f"✗ Could not find virtual sink device: {VIRTUAL_SINK_NAME}")
    print(f"⚠️ Attempting to recreate virtual sink...")
    if not create_virtual_sink():
        return None

    # Wait a moment for the device to appear
    import time
    time.sleep(1.0)
    device_id = get_virtual_sink_device_id()
    if device_id is None:
        print(f"✗ Still could not find virtual sink after recreation")
        print(f"\n📋 Available devices:")
        # Only list devices that look related, to keep the hint short.
        for i, dev in enumerate(sd.query_devices()):
            name = dev['name'].lower()
            if 'soprano' in name or 'rvc' in name:
                print(f" {i}: {dev['name']}")
    return device_id


def _chunk_to_frames(chunk):
    """Turn one TTS chunk into float32 (N, CHANNELS) frames at SAMPLE_RATE, or None if empty."""
    # Convert torch tensor to numpy if needed
    if isinstance(chunk, torch.Tensor):
        chunk = chunk.detach().cpu().numpy()

    # Whatever the incoming shape, treat the samples as one mono sequence.
    mono = np.asarray(chunk).reshape(-1)
    if mono.size == 0:
        # The FFT resampler cannot handle zero samples; an empty chunk must
        # not abort the whole utterance.
        return None

    # Check for invalid values before resampling
    if not np.all(np.isfinite(mono)):
        print(f"⚠️ Warning: Invalid values in soprano output, cleaning...")
        mono = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)

    # Resample from SOPRANO_RATE to SAMPLE_RATE if needed
    if SOPRANO_RATE != SAMPLE_RATE:
        target_len = int(len(mono) * SAMPLE_RATE / SOPRANO_RATE)
        if target_len == 0:
            return None
        mono = scipy_signal.resample(mono, target_len)

    # Ensure no NaN or inf values after resampling (clip to valid range)
    if not np.all(np.isfinite(mono)):
        print(f"⚠️ Warning: Invalid values after resampling, cleaning...")
        mono = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)
    mono = np.clip(mono, -1.0, 1.0)

    # Duplicate the mono signal into every output channel so the frame
    # width always matches the stream's channel count.
    return np.repeat(mono.reshape(-1, 1), CHANNELS, axis=1).astype(np.float32)


def stream_to_virtual_sink(tts_model, text, chunk_size=1):
    """Stream soprano TTS output to the virtual sink.

    Args:
        tts_model: Object exposing ``infer_stream(text, chunk_size=...)``
            that yields audio chunks (torch tensors or array-likes).
            Chunks are assumed to be mono audio at ``SOPRANO_RATE`` --
            TODO confirm against the Soprano API.
        text: Text to synthesise.
        chunk_size: Forwarded unchanged to ``infer_stream``.

    Returns:
        bool: True when streaming finished (or was stopped via the global
        ``running`` flag); False if the sink device could not be found or an
        error occurred while generating/writing audio.
    """
    device_id = _locate_sink_device()
    if device_id is None:
        return False

    device_info = sd.query_devices(device_id)
    print(f"✓ Using output device: {device_info['name']}")

    # The output stream is always opened at SAMPLE_RATE, so that is the rate
    # reported here.
    print(f" Sample rate: {SAMPLE_RATE} Hz")
    print(f"\n🎤 Generating and streaming speech...")
    print(f"Text: \"{text}\"\n")

    try:
        # Generate streaming audio from soprano
        stream = tts_model.infer_stream(text, chunk_size=chunk_size)

        # Open output stream to virtual sink
        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype='float32',
            device=device_id,
            blocksize=0,
        ) as out_stream:
            first_chunk = True
            for chunk in stream:
                if not running:
                    break

                if first_chunk:
                    print("✓ First audio chunk generated and streaming started")
                    first_chunk = False

                frames = _chunk_to_frames(chunk)
                if frames is None:
                    continue

                # Write to virtual sink
                out_stream.write(frames)

        print("✓ Speech generation and streaming completed")
        return True

    except Exception as e:
        # Boundary handler: one failed utterance must not kill the
        # interactive loop, but the traceback is still shown.
        print(f"✗ Error during streaming: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Run the interactive text-to-speech loop.

    Creates the virtual sink, loads the Soprano model, then repeatedly reads
    a line of text and streams the synthesised speech to the sink until the
    user types ``quit``/``exit``/``q``, sends EOF, or presses Ctrl+C. The
    sink is removed before returning.

    Returns:
        int: 0 on normal exit, 1 if the sink could not be created or the
        model failed to load.
    """
    rule = "=" * 70

    # Set up signal handler for graceful shutdown
    signal.signal(signal.SIGINT, signal_handler)

    print(rule)
    print("Soprano TTS to Virtual Sink for RVC")
    print(rule)
    print()

    # Create virtual sink
    if not create_virtual_sink():
        print("\n⚠️ If sink already exists, removing and recreating...")
        remove_virtual_sink()
        if not create_virtual_sink():
            print("✗ Failed to create virtual sink. Exiting.")
            return 1

    print()
    print(rule)
    print("Virtual sink setup complete!")
    print(rule)
    print()
    print("📝 Next steps:")
    print(f" 1. Open RVC realtime GUI (gui_v1.py)")
    print(f" 2. Select '{VIRTUAL_SINK_NAME}.monitor' as the INPUT device")
    print(f" 3. Select your desired output device")
    print(f" 4. Load your RVC model and start conversion")
    print(f" 5. Return here and type text to convert")
    print()
    print(rule)
    print()

    # Initialize Soprano TTS
    print("🔄 Loading Soprano TTS model...")
    try:
        tts = SopranoTTS(
            backend='auto',
            device='auto',
            cache_size_mb=100,
            decoder_batch_size=1,
        )
        print("✓ Soprano TTS model loaded successfully")
    except Exception as e:
        print(f"✗ Failed to load Soprano TTS: {e}")
        remove_virtual_sink()
        return 1

    print()
    print(rule)
    print("Ready! Type text to generate speech (Ctrl+C to exit)")
    print(rule)
    print()

    # Main loop - get text input and generate speech
    try:
        while running:
            # A handler that only sets a flag does not interrupt a blocking
            # input(): Python resumes the read after the handler returns, so
            # Ctrl+C would appear to do nothing until Enter is pressed.
            # Use the default handler while waiting at the prompt so Ctrl+C
            # raises KeyboardInterrupt, then restore the graceful handler
            # for the streaming phase.
            signal.signal(signal.SIGINT, signal.default_int_handler)
            try:
                text = input("\n🎙️ Enter text: ").strip()
            except EOFError:
                break
            finally:
                signal.signal(signal.SIGINT, signal_handler)

            if not text:
                print("⚠️ Please enter some text")
                continue

            if text.lower() in ['quit', 'exit', 'q']:
                break

            # Stream the text to the virtual sink
            stream_to_virtual_sink(tts, text, chunk_size=1)
            print()

    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user")

    finally:
        # Clean up
        remove_virtual_sink()
        print("\n✓ Cleanup complete. Goodbye!")

    return 0
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|