Working with GUI, auto loopback creation, soprano streaming
This commit is contained in:
299
soprano_to_virtual_sink.py
Executable file
299
soprano_to_virtual_sink.py
Executable file
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Soprano TTS to Virtual Sink
|
||||
This script takes text input and streams Soprano TTS output to a virtual PulseAudio sink
|
||||
that can be used as input for RVC realtime voice conversion.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import signal
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
import torch
|
||||
from scipy import signal as scipy_signal
|
||||
|
||||
# Add soprano to path
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano'))
|
||||
from soprano import SopranoTTS
|
||||
|
||||
# Configuration
|
||||
# Name given to the PulseAudio null sink; the matching capture source is
# reported to the user as "<name>.monitor".
VIRTUAL_SINK_NAME = "soprano_to_rvc"
# Rate used both for the null sink and for the sounddevice output stream.
SAMPLE_RATE = 48000  # Use 48kHz for better compatibility with audio systems
# Rate of the audio produced by the TTS model; chunks are resampled from this
# rate to SAMPLE_RATE before being written.
SOPRANO_RATE = 32000  # Soprano outputs at 32kHz
CHANNELS = 2  # Use stereo to match RVC expectations

# Global flag for graceful shutdown: cleared by signal_handler and polled by
# the streaming loop and by the input loop in main().
running = True
|
||||
|
||||
|
||||
def signal_handler(sig, frame):
    """Handle Ctrl+C by requesting a graceful shutdown.

    Args:
        sig: Signal number delivered by the interpreter (unused).
        frame: Stack frame that was interrupted (unused).

    The module-level ``running`` flag is cleared so that loops polling it
    stop at their next check.
    """
    global running
    # Record the request before doing any I/O so it cannot be lost if the
    # message below fails to print.
    running = False
    try:
        print("\n\nShutting down gracefully...")
    except RuntimeError:
        # The signal may arrive while the main code is itself inside print();
        # a reentrant write to stdout raises RuntimeError. The message is
        # purely cosmetic, so it is dropped rather than aborting the handler.
        pass
|
||||
|
||||
|
||||
def create_virtual_sink():
    """Create a PulseAudio virtual (null) sink for audio output.

    The sink is named ``VIRTUAL_SINK_NAME`` and is created through
    ``pactl load-module module-null-sink`` unless a sink with exactly that
    name is already listed.

    Returns:
        bool: True if the sink already existed or was created, False if
        ``pactl`` could not be executed or the module failed to load.
    """
    # Check if sink already exists
    existing_sinks = set()
    try:
        listing = subprocess.run(
            ["pactl", "list", "sinks", "short"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError:
        # Listing failed; fall through and attempt the creation anyway.
        pass
    except OSError as e:
        # pactl is missing or not executable: nothing below can succeed.
        print(f"✗ Failed to create virtual sink: {e}")
        return False
    else:
        # The short listing has the sink name in the second column. Compare
        # whole names so that e.g. "<name>_2" is not mistaken for our sink.
        for line in listing.stdout.splitlines():
            fields = line.split()
            if len(fields) > 1:
                existing_sinks.add(fields[1])

    if VIRTUAL_SINK_NAME in existing_sinks:
        print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' already exists")
        print(f" Monitor source: {VIRTUAL_SINK_NAME}.monitor")
        return True

    print(f"Creating virtual sink: {VIRTUAL_SINK_NAME}")
    try:
        # Create a null sink (virtual audio device) using the configured
        # sample rate and channel count.
        subprocess.run(
            [
                "pactl", "load-module", "module-null-sink",
                f"sink_name={VIRTUAL_SINK_NAME}",
                f"sink_properties=device.description={VIRTUAL_SINK_NAME}",
                f"rate={SAMPLE_RATE}",
                f"channels={CHANNELS}",
            ],
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as e:
        # stderr is captured as bytes; decode leniently so an odd byte in the
        # error text cannot raise a second exception here.
        detail = e.stderr.decode(errors="replace") if e.stderr else str(e)
        print(f"✗ Failed to create virtual sink: {detail}")
        return False
    except OSError as e:
        print(f"✗ Failed to create virtual sink: {e}")
        return False

    print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' created successfully")
    print(f" Monitor source: {VIRTUAL_SINK_NAME}.monitor")
    return True
|
||||
|
||||
|
||||
def remove_virtual_sink():
    """Remove the virtual sink on exit.

    Looks through ``pactl list modules short`` for the ``module-null-sink``
    instance whose arguments contain ``sink_name=<VIRTUAL_SINK_NAME>`` and
    unloads it. Failures are reported on stdout and never raised, because
    this runs as best-effort cleanup.
    """
    print(f"\nRemoving virtual sink: {VIRTUAL_SINK_NAME}")
    wanted_argument = f"sink_name={VIRTUAL_SINK_NAME}"
    try:
        # Find the module ID
        listing = subprocess.run(
            ["pactl", "list", "modules", "short"],
            capture_output=True,
            text=True,
            check=True,
        )
        for line in listing.stdout.splitlines():
            # Columns: module id, module name, then the module arguments.
            fields = line.split()
            if len(fields) < 3 or fields[1] != "module-null-sink":
                # Other modules (e.g. a loopback reading "<name>.monitor")
                # may mention the sink name but must not be unloaded here.
                continue
            if wanted_argument not in fields[2:]:
                continue
            subprocess.run(["pactl", "unload-module", fields[0]], check=True)
            print("✓ Virtual sink removed")
            return
        print(f"⚠️ No loaded null-sink module found for '{VIRTUAL_SINK_NAME}'")
    except (subprocess.SubprocessError, OSError) as e:
        print(f"✗ Error removing virtual sink: {e}")
|
||||
|
||||
|
||||
def get_virtual_sink_device_id():
    """Get the sounddevice ID for our virtual sink.

    Returns:
        int | None: Index of the first device whose name contains
        ``VIRTUAL_SINK_NAME`` and that offers at least one output channel,
        or None when no such device is listed.
    """
    # Force refresh device list. NOTE(review): these are private sounddevice
    # functions; they are used so that a sink created after the library was
    # first initialised shows up in query_devices().
    sd._terminate()
    sd._initialize()

    for index, device in enumerate(sd.query_devices()):
        if VIRTUAL_SINK_NAME not in device['name']:
            continue
        # Skip capture-only entries (such as a "<name>.monitor" source): the
        # caller opens the returned device as an output stream.
        if device['max_output_channels'] > 0:
            return index
    return None
|
||||
|
||||
|
||||
def stream_to_virtual_sink(tts_model, text, chunk_size=1):
    """Stream soprano TTS output to the virtual sink.

    Args:
        tts_model: Object providing ``infer_stream(text, chunk_size=...)``,
            which is iterated for audio chunks (torch tensors or array-likes).
            Chunks are treated as mono audio at ``SOPRANO_RATE``.
        text: Text to synthesize.
        chunk_size: Forwarded unchanged to ``infer_stream``.

    Returns:
        bool: True when the stream ran to the end (or was stopped through the
        ``running`` flag); False when the output device could not be found or
        an exception occurred while generating or writing audio.
    """
    device_id = get_virtual_sink_device_id()

    if device_id is None:
        print(f"✗ Could not find virtual sink device: {VIRTUAL_SINK_NAME}")
        print("⚠️ Attempting to recreate virtual sink...")
        if not create_virtual_sink():
            return False

        # Wait a moment for the device to appear
        import time
        time.sleep(1.0)
        device_id = get_virtual_sink_device_id()
        if device_id is None:
            print("✗ Still could not find virtual sink after recreation")
            print("\n📋 Available devices:")
            for i, dev in enumerate(sd.query_devices()):
                lowered = dev['name'].lower()
                if 'soprano' in lowered or 'rvc' in lowered:
                    print(f" {i}: {dev['name']}")
            return False

    device_info = sd.query_devices(device_id)
    print(f"✓ Using output device: {device_info['name']}")

    # The output stream is always opened at SAMPLE_RATE (the rate the null
    # sink is created with), so that is the rate reported here.
    print(f" Sample rate: {SAMPLE_RATE} Hz")
    print("\n🎤 Generating and streaming speech...")
    print(f"Text: \"{text}\"\n")

    try:
        # Generate streaming audio from soprano
        stream = tts_model.infer_stream(text, chunk_size=chunk_size)

        # Open output stream to virtual sink
        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype='float32',
            device=device_id,
            blocksize=0,
        ) as out_stream:
            first_chunk = True
            for chunk in stream:
                if not running:
                    break

                if first_chunk:
                    print("✓ First audio chunk generated and streaming started")
                    first_chunk = False

                # Convert torch tensor to numpy if needed; .float() makes
                # reduced-precision tensors convertible as well.
                if isinstance(chunk, torch.Tensor):
                    chunk = chunk.detach().cpu().float().numpy()

                # Whatever the incoming layout ((N,), (1, N), (N, 1), ...),
                # the samples are consumed as one flat mono sequence.
                samples = np.asarray(chunk).reshape(-1)

                # Check for invalid values before resampling
                if not np.all(np.isfinite(samples)):
                    print("⚠️ Warning: Invalid values in soprano output, cleaning...")
                    samples = np.nan_to_num(samples, nan=0.0, posinf=1.0, neginf=-1.0)

                # Resample from the Soprano rate to the output rate if needed
                if SOPRANO_RATE != SAMPLE_RATE:
                    num_samples = int(len(samples) * SAMPLE_RATE / SOPRANO_RATE)
                else:
                    num_samples = len(samples)

                if num_samples == 0:
                    # Nothing to play; an empty chunk would also make the FFT
                    # resampler raise and abort the whole utterance.
                    continue

                if SOPRANO_RATE != SAMPLE_RATE:
                    resampled = scipy_signal.resample(samples, num_samples)
                else:
                    resampled = samples

                # Ensure no NaN or inf values after resampling (clip to valid range)
                if not np.all(np.isfinite(resampled)):
                    print("⚠️ Warning: Invalid values after resampling, cleaning...")
                    resampled = np.nan_to_num(resampled, nan=0.0, posinf=1.0, neginf=-1.0)
                resampled = np.clip(resampled, -1.0, 1.0)

                # Duplicate the mono signal into every output channel so the
                # frame width always matches the stream's channel count.
                frames = np.repeat(
                    resampled[:, np.newaxis], CHANNELS, axis=1
                ).astype(np.float32)

                # Write to virtual sink
                out_stream.write(frames)

        print("✓ Speech generation and streaming completed")
        return True

    except Exception as e:
        # Top-level boundary for one utterance: report and let the caller
        # carry on with the next prompt.
        print(f"✗ Error during streaming: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
def main():
    """Run the interactive text-to-virtual-sink session.

    Creates the virtual sink, loads the Soprano TTS model, then repeatedly
    prompts for text and streams the synthesized speech to the sink until the
    user quits (``quit``/``exit``/``q``, end of input, or Ctrl+C). The sink
    is removed again on the way out.

    Returns:
        int: 0 after a normal session, 1 if the sink could not be created or
        the model failed to load.
    """
    # Set up signal handler for graceful shutdown
    signal.signal(signal.SIGINT, signal_handler)

    print("=" * 70)
    print("Soprano TTS to Virtual Sink for RVC")
    print("=" * 70)
    print()

    # Create virtual sink
    if not create_virtual_sink():
        print("\n⚠️ If sink already exists, removing and recreating...")
        remove_virtual_sink()
        if not create_virtual_sink():
            print("✗ Failed to create virtual sink. Exiting.")
            return 1

    print()
    print("=" * 70)
    print("Virtual sink setup complete!")
    print("=" * 70)
    print()
    print("📝 Next steps:")
    print(" 1. Open RVC realtime GUI (gui_v1.py)")
    print(f" 2. Select '{VIRTUAL_SINK_NAME}.monitor' as the INPUT device")
    print(" 3. Select your desired output device")
    print(" 4. Load your RVC model and start conversion")
    print(" 5. Return here and type text to convert")
    print()
    print("=" * 70)
    print()

    # Initialize Soprano TTS
    print("🔄 Loading Soprano TTS model...")
    try:
        tts = SopranoTTS(
            backend='auto',
            device='auto',
            cache_size_mb=100,
            decoder_batch_size=1,
        )
        print("✓ Soprano TTS model loaded successfully")
    except Exception as e:
        print(f"✗ Failed to load Soprano TTS: {e}")
        remove_virtual_sink()
        return 1

    print()
    print("=" * 70)
    print("Ready! Type text to generate speech (Ctrl+C to exit)")
    print("=" * 70)
    print()

    # Main loop - get text input and generate speech
    try:
        while running:
            # signal_handler only clears a flag and does not raise, so a
            # Ctrl+C received while input() is blocked would be swallowed and
            # the prompt would keep waiting for Enter. While waiting at the
            # prompt, use the default handler so Ctrl+C raises
            # KeyboardInterrupt; the flag-based handler is restored before
            # any audio is streamed.
            signal.signal(signal.SIGINT, signal.default_int_handler)
            try:
                text = input("\n🎙️ Enter text: ").strip()
            except EOFError:
                break
            finally:
                signal.signal(signal.SIGINT, signal_handler)

            if not text:
                print("⚠️ Please enter some text")
                continue

            if text.lower() in ['quit', 'exit', 'q']:
                break

            # Stream the text to the virtual sink
            stream_to_virtual_sink(tts, text, chunk_size=1)
            print()

    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user")

    finally:
        # Clean up
        remove_virtual_sink()
        print("\n✓ Cleanup complete. Goodbye!")

    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user