# soprano_to_virtual_sink.py

#!/usr/bin/env python3
"""
Soprano TTS to Virtual Sink
This script takes text input and streams Soprano TTS output to a virtual PulseAudio sink
that can be used as input for RVC realtime voice conversion.
"""

import sys
import os
import subprocess
import signal
import sounddevice as sd
import numpy as np
import torch
from scipy import signal as scipy_signal

# Add soprano to path
# The 'soprano' directory that sits next to this script is put first on
# sys.path so that the `from soprano import SopranoTTS` below resolves to it.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano'))
from soprano import SopranoTTS

# Configuration
# Name given to the PulseAudio null sink; also the substring used to find the
# matching sounddevice output device.
VIRTUAL_SINK_NAME = "soprano_to_rvc"
# Rate (Hz) requested for the null sink and used for the output stream.
SAMPLE_RATE = 48000  # Use 48kHz for better compatibility with audio systems
# Rate (Hz) the TTS chunks are assumed to arrive at before resampling.
SOPRANO_RATE = 32000  # Soprano outputs at 32kHz
# Channel count of the output stream.
CHANNELS = 2  # Use stereo to match RVC expectations

# Global flag for graceful shutdown
# Cleared by signal_handler(); polled by the input loop in main() and by the
# chunk loop in stream_to_virtual_sink().
running = True


def signal_handler(sig, frame):
    """SIGINT callback: request a cooperative shutdown.

    Clears the module-level ``running`` flag that the processing loops poll
    and tells the user that shutdown has started.

    Args:
        sig: Signal number supplied by the ``signal`` module (unused).
        frame: Interrupted stack frame supplied by the ``signal`` module
            (unused).
    """
    global running
    running = False
    print("\n\nShutting down gracefully...")


def create_virtual_sink():
    """Make sure the PulseAudio null sink ``VIRTUAL_SINK_NAME`` exists.

    The current sinks are listed with ``pactl list sinks short``; when no
    sink with exactly that name is present, ``module-null-sink`` is loaded
    with the configured sample rate and channel count.

    Returns:
        bool: True when the sink already existed or was created, False when
        ``pactl`` could not be executed or refused to load the module.
    """
    # Check if sink already exists
    try:
        listing = subprocess.run(
            ["pactl", "list", "sinks", "short"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError:
        # Listing failed; fall through and let load-module report the error.
        pass
    except OSError as e:
        # pactl is missing or not executable: nothing else here can work.
        print(f"✗ Failed to create virtual sink: {e}")
        return False
    else:
        # Compare whole whitespace-separated fields so that a longer sink
        # name that merely contains ours is not mistaken for it.
        if any(VIRTUAL_SINK_NAME in row.split()
               for row in listing.stdout.splitlines()):
            print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' already exists")
            print(f"  Monitor source: {VIRTUAL_SINK_NAME}.monitor")
            return True

    print(f"Creating virtual sink: {VIRTUAL_SINK_NAME}")
    try:
        # Create a null sink (virtual audio device) at the configured rate
        subprocess.run([
            "pactl", "load-module", "module-null-sink",
            f"sink_name={VIRTUAL_SINK_NAME}",
            f"sink_properties=device.description={VIRTUAL_SINK_NAME}",
            f"rate={SAMPLE_RATE}",
            f"channels={CHANNELS}",  # keep in step with the output stream
        ], check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # stderr is bytes here; decode leniently so reporting cannot fail.
        detail = (e.stderr or b"").decode(errors="replace").strip()
        print(f"✗ Failed to create virtual sink: {detail or e}")
        return False
    except OSError as e:
        print(f"✗ Failed to create virtual sink: {e}")
        return False

    print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' created successfully")
    print(f"  Monitor source: {VIRTUAL_SINK_NAME}.monitor")
    return True


def remove_virtual_sink():
    """Unload the null-sink module(s) that provide ``VIRTUAL_SINK_NAME``.

    Best-effort cleanup: failures are printed, never raised.  Only
    ``module-null-sink`` entries whose ``sink_name`` argument equals
    ``VIRTUAL_SINK_NAME`` are unloaded, so other modules that merely mention
    the sink (for example a loopback reading its monitor) are not unloaded
    in its place.
    """
    print(f"\nRemoving virtual sink: {VIRTUAL_SINK_NAME}")
    try:
        # Find the module ID(s)
        listing = subprocess.run(
            ["pactl", "list", "modules", "short"],
            capture_output=True,
            text=True,
            check=True,
        )

        module_ids = []
        for row in listing.stdout.splitlines():
            # Expected layout: <id> <module name> <arguments...>
            tokens = row.split()
            if len(tokens) < 3 or tokens[1] != "module-null-sink":
                continue
            for argument in tokens[2:]:
                key, _, value = argument.partition("=")
                # Tolerate quoted values in the listing.
                if key == "sink_name" and value.strip("\"'") == VIRTUAL_SINK_NAME:
                    module_ids.append(tokens[0])
                    break

        if not module_ids:
            print("⚠️  Virtual sink module not found; nothing to remove")
            return

        for module_id in module_ids:
            subprocess.run(["pactl", "unload-module", module_id], check=True)
        print(f"✓ Virtual sink removed")
    except (subprocess.SubprocessError, OSError) as e:
        print(f"✗ Error removing virtual sink: {e}")


def get_virtual_sink_device_id():
    """Return the sounddevice index of the virtual sink's output device.

    Returns:
        int | None: Index of the first device whose name contains
        ``VIRTUAL_SINK_NAME`` and that reports at least one output channel,
        or None when no such device is listed.
    """
    # Force refresh device list so a sink created after start-up can show up.
    # NOTE: _terminate()/_initialize() are private sounddevice functions.
    sd._terminate()
    sd._initialize()

    for index, device in enumerate(sd.query_devices()):
        if VIRTUAL_SINK_NAME not in device['name']:
            continue
        # Skip input-only entries (e.g. the sink's monitor source): they
        # share the name but cannot be opened as an output stream.
        if device['max_output_channels'] > 0:
            return index
    return None


def _chunk_to_output_frames(chunk):
    """Turn one TTS chunk into float32 frames shaped (frames, CHANNELS).

    Returns None when the chunk holds no samples, so the caller can skip it.
    """
    # Convert torch tensor to numpy if needed
    if isinstance(chunk, torch.Tensor):
        chunk = chunk.detach().cpu().numpy()

    # Whatever the incoming layout, treat the data as one mono signal.
    mono = np.asarray(chunk, dtype=np.float32).reshape(-1)
    if mono.size == 0:
        return None

    # Check for invalid values before resampling
    if not np.all(np.isfinite(mono)):
        print(f"⚠️  Warning: Invalid values in soprano output, cleaning...")
        mono = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)

    # Resample from SOPRANO_RATE to SAMPLE_RATE if they differ.
    # NOTE(review): each chunk is resampled on its own; audible artefacts at
    # chunk boundaries are possible -- confirm whether that matters here.
    if SOPRANO_RATE != SAMPLE_RATE:
        num_samples = int(len(mono) * SAMPLE_RATE / SOPRANO_RATE)
        if num_samples == 0:
            return None
        mono = scipy_signal.resample(mono, num_samples)

    # Ensure no NaN or inf values after resampling, then clip to [-1, 1]
    if not np.all(np.isfinite(mono)):
        print(f"⚠️  Warning: Invalid values after resampling, cleaning...")
    mono = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)
    mono = np.clip(mono, -1.0, 1.0)

    # Duplicate the mono signal onto every output channel.
    return np.repeat(mono[:, np.newaxis], CHANNELS, axis=1).astype(np.float32)


def stream_to_virtual_sink(tts_model, text, chunk_size=1):
    """Stream soprano TTS output for ``text`` to the virtual sink.

    Args:
        tts_model: Object providing ``infer_stream(text, chunk_size=...)``,
            which yields audio chunks (torch tensors or array-likes).
        text: Text to synthesise.
        chunk_size: Forwarded unchanged to ``infer_stream``.

    Returns:
        bool: True when every chunk was written to the sink.  False when the
        sink device could not be found or recreated, when an error occurred
        while streaming, or when streaming stopped early because the global
        ``running`` flag was cleared.
    """
    device_id = get_virtual_sink_device_id()

    if device_id is None:
        print(f"✗ Could not find virtual sink device: {VIRTUAL_SINK_NAME}")
        print(f"⚠️  Attempting to recreate virtual sink...")
        if not create_virtual_sink():
            return False

        # Wait a moment for the device to appear
        import time
        time.sleep(1.0)
        device_id = get_virtual_sink_device_id()
        if device_id is None:
            print(f"✗ Still could not find virtual sink after recreation")
            print(f"\n📋 Available devices:")
            for i, dev in enumerate(sd.query_devices()):
                lowered = dev['name'].lower()
                if 'soprano' in lowered or 'rvc' in lowered:
                    print(f"   {i}: {dev['name']}")
            return False

    device_info = sd.query_devices(device_id)
    print(f"✓ Using output device: {device_info['name']}")

    # The stream is always opened at SAMPLE_RATE, so that is what is reported.
    print(f"  Sample rate: {SAMPLE_RATE} Hz")
    print(f"\n🎤 Generating and streaming speech...")
    print(f"Text: \"{text}\"\n")

    try:
        # Generate streaming audio from soprano
        stream = tts_model.infer_stream(text, chunk_size=chunk_size)
        interrupted = False

        # Open output stream to virtual sink
        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype='float32',
            device=device_id,
            blocksize=0
        ) as out_stream:
            first_chunk = True
            for chunk in stream:
                if not running:
                    interrupted = True
                    break

                if first_chunk:
                    print("✓ First audio chunk generated and streaming started")
                    first_chunk = False

                frames = _chunk_to_output_frames(chunk)
                if frames is None:
                    # Nothing to play for an empty chunk.
                    continue

                # Write to virtual sink
                out_stream.write(frames)

        if interrupted:
            # Do not claim success for a run that was cut short.
            print("⚠️  Streaming interrupted before completion")
            return False

        print("✓ Speech generation and streaming completed")
        return True

    except Exception as e:
        print(f"✗ Error during streaming: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run the interactive text-to-speech session.

    Creates the virtual sink, loads the Soprano TTS model, then repeatedly
    reads a line of text and streams the synthesised speech to the sink until
    the user types ``quit``/``exit``/``q``, sends EOF, or presses Ctrl+C.
    The virtual sink is removed on the way out.

    Returns:
        int: Process exit status -- 0 after a normal session, 1 when the sink
        could not be created or the model could not be loaded.
    """
    # Set up signal handler for graceful shutdown
    signal.signal(signal.SIGINT, signal_handler)

    banner = "=" * 70
    print(banner)
    print("Soprano TTS to Virtual Sink for RVC")
    print(banner)
    print()

    # Create virtual sink
    if not create_virtual_sink():
        print("\n⚠️  If sink already exists, removing and recreating...")
        remove_virtual_sink()
        if not create_virtual_sink():
            print("✗ Failed to create virtual sink. Exiting.")
            return 1

    print()
    print(banner)
    print("Virtual sink setup complete!")
    print(banner)
    print()
    print("📝 Next steps:")
    print(f"   1. Open RVC realtime GUI (gui_v1.py)")
    print(f"   2. Select '{VIRTUAL_SINK_NAME}.monitor' as the INPUT device")
    print(f"   3. Select your desired output device")
    print(f"   4. Load your RVC model and start conversion")
    print(f"   5. Return here and type text to convert")
    print()
    print(banner)
    print()

    # Initialize Soprano TTS
    print("🔄 Loading Soprano TTS model...")
    try:
        tts = SopranoTTS(
            backend='auto',
            device='auto',
            cache_size_mb=100,
            decoder_batch_size=1
        )
        print("✓ Soprano TTS model loaded successfully")
    except Exception as e:
        print(f"✗ Failed to load Soprano TTS: {e}")
        remove_virtual_sink()
        return 1

    print()
    print(banner)
    print("Ready! Type text to generate speech (Ctrl+C to exit)")
    print(banner)
    print()

    # Main loop - get text input and generate speech
    try:
        while running:
            # signal_handler only sets a flag, which does not end a blocking
            # input() call: the read is resumed and the program would wait
            # for Enter.  While at the prompt, use Python's default SIGINT
            # behaviour (raise KeyboardInterrupt) so Ctrl+C exits at once;
            # the flag-based handler is restored for the streaming phase.
            signal.signal(signal.SIGINT, signal.default_int_handler)
            try:
                text = input("\n🎙️  Enter text: ").strip()
            except EOFError:
                break
            finally:
                signal.signal(signal.SIGINT, signal_handler)

            if not text:
                print("⚠️  Please enter some text")
                continue

            if text.lower() in ['quit', 'exit', 'q']:
                break

            # Stream the text to the virtual sink
            stream_to_virtual_sink(tts, text, chunk_size=1)
            print()

    except KeyboardInterrupt:
        print("\n\n⚠️  Interrupted by user")

    finally:
        # Clean up
        remove_virtual_sink()
        print("\n✓ Cleanup complete. Goodbye!")

    return 0


if __name__ == "__main__":
    sys.exit(main())
Working with GUI, auto loopback creation, soprano streaming 2026-01-12 22:55:21 +02:00			`#!/usr/bin/env python3`
			`"""`
			`Soprano TTS to Virtual Sink`
			`This script takes text input and streams Soprano TTS output to a virtual PulseAudio sink`
			`that can be used as input for RVC realtime voice conversion.`
			`"""`

			`import sys`
			`import os`
			`import subprocess`
			`import signal`
			`import sounddevice as sd`
			`import numpy as np`
			`import torch`
			`from scipy import signal as scipy_signal`

			`# Add soprano to path`
			`sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano'))`
			`from soprano import SopranoTTS`

			`# Configuration`
			`VIRTUAL_SINK_NAME = "soprano_to_rvc"`
			`SAMPLE_RATE = 48000 # Use 48kHz for better compatibility with audio systems`
			`SOPRANO_RATE = 32000 # Soprano outputs at 32kHz`
			`CHANNELS = 2 # Use stereo to match RVC expectations`

			`# Global flag for graceful shutdown`
			`running = True`


			`def signal_handler(sig, frame):`
			`"""Handle Ctrl+C gracefully"""`
			`global running`
			`print("\n\nShutting down gracefully...")`
			`running = False`


			`def create_virtual_sink():`
			`"""Create a PulseAudio virtual sink for audio output"""`
			`# Check if sink already exists`
			`try:`
			`result = subprocess.run(`
			`["pactl", "list", "sinks", "short"],`
			`capture_output=True,`
			`text=True,`
			`check=True`
			`)`
			`if VIRTUAL_SINK_NAME in result.stdout:`
			`print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' already exists")`
			`print(f" Monitor source: {VIRTUAL_SINK_NAME}.monitor")`
			`return True`
			`except subprocess.CalledProcessError:`
			`pass`

			`print(f"Creating virtual sink: {VIRTUAL_SINK_NAME}")`
			`try:`
			`# Create a null sink (virtual audio device) at 48kHz for compatibility`
			`subprocess.run([`
			`"pactl", "load-module", "module-null-sink",`
			`f"sink_name={VIRTUAL_SINK_NAME}",`
			`f"sink_properties=device.description={VIRTUAL_SINK_NAME}",`
			`f"rate={SAMPLE_RATE}",`
			`"channels=2" # Stereo to match RVC expectations`
			`], check=True, capture_output=True)`
			`print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' created successfully")`
			`print(f" Monitor source: {VIRTUAL_SINK_NAME}.monitor")`
			`return True`
			`except subprocess.CalledProcessError as e:`
			`print(f"✗ Failed to create virtual sink: {e.stderr.decode()}")`
			`return False`


			`def remove_virtual_sink():`
			`"""Remove the virtual sink on exit"""`
			`print(f"\nRemoving virtual sink: {VIRTUAL_SINK_NAME}")`
			`try:`
			`# Find the module ID`
			`result = subprocess.run(`
			`["pactl", "list", "modules", "short"],`
			`capture_output=True,`
			`text=True,`
			`check=True`
			`)`
			`for line in result.stdout.split('\n'):`
			`if VIRTUAL_SINK_NAME in line:`
			`module_id = line.split()[0]`
			`subprocess.run(["pactl", "unload-module", module_id], check=True)`
			`print(f"✓ Virtual sink removed")`
			`return`
			`except Exception as e:`
			`print(f"✗ Error removing virtual sink: {e}")`


			`def get_virtual_sink_device_id():`
			`"""Get the sounddevice ID for our virtual sink"""`
			`# Force refresh device list`
			`sd._terminate()`
			`sd._initialize()`

			`devices = sd.query_devices()`
			`for i, device in enumerate(devices):`
			`if VIRTUAL_SINK_NAME in device['name']:`
			`return i`
			`return None`


			`def stream_to_virtual_sink(tts_model, text, chunk_size=1):`
			`"""Stream soprano TTS output to the virtual sink"""`
			`device_id = get_virtual_sink_device_id()`

			`if device_id is None:`
			`print(f"✗ Could not find virtual sink device: {VIRTUAL_SINK_NAME}")`
			`print(f"⚠️ Attempting to recreate virtual sink...")`
			`if create_virtual_sink():`
			`# Wait a moment for the device to appear`
			`import time`
			`time.sleep(1.0) # Increased wait time`
			`device_id = get_virtual_sink_device_id()`
			`if device_id is None:`
			`print(f"✗ Still could not find virtual sink after recreation")`
			`print(f"\n📋 Available devices:")`
			`devices = sd.query_devices()`
			`for i, dev in enumerate(devices):`
			`if 'soprano' in dev['name'].lower() or 'rvc' in dev['name'].lower():`
			`print(f" {i}: {dev['name']}")`
			`return False`
			`else:`
			`return False`

			`device_info = sd.query_devices(device_id)`
			`print(f"✓ Using output device: {device_info['name']}")`

			`# Get the device's default sample rate if 32kHz isn't supported`
			`device_sr = int(device_info.get('default_samplerate', SAMPLE_RATE))`
			`if device_sr == 0 or device_sr != SAMPLE_RATE:`
			`device_sr = SAMPLE_RATE # Try with soprano's rate anyway`

			`print(f" Sample rate: {device_sr} Hz")`
			`print(f"\n🎤 Generating and streaming speech...")`
			`print(f"Text: \"{text}\"\n")`

			`try:`
			`# Generate streaming audio from soprano`
			`stream = tts_model.infer_stream(text, chunk_size=chunk_size)`

			`# Open output stream to virtual sink`
			`with sd.OutputStream(`
			`samplerate=SAMPLE_RATE,`
			`channels=CHANNELS,`
			`dtype='float32',`
			`device=device_id,`
			`blocksize=0`
			`) as out_stream:`
			`first_chunk = True`
			`for chunk in stream:`
			`if not running:`
			`break`

			`if first_chunk:`
			`print("✓ First audio chunk generated and streaming started")`
			`first_chunk = False`

			`# Convert torch tensor to numpy if needed`
			`if isinstance(chunk, torch.Tensor):`
			`chunk = chunk.detach().cpu().numpy()`

			`# Ensure correct shape for mono audio`
			`if chunk.ndim == 1:`
			`chunk_1d = chunk`
			`elif chunk.ndim == 2 and chunk.shape[0] == 1:`
			`chunk_1d = chunk.flatten()`
			`elif chunk.ndim == 2 and chunk.shape[1] == 1:`
			`chunk_1d = chunk.flatten()`
			`else:`
			`chunk_1d = chunk.flatten()`

			`# Check for invalid values before resampling`
			`if not np.all(np.isfinite(chunk_1d)):`
			`print(f"⚠️ Warning: Invalid values in soprano output, cleaning...")`
			`chunk_1d = np.nan_to_num(chunk_1d, nan=0.0, posinf=1.0, neginf=-1.0)`

			`# Resample from 32kHz (Soprano) to 48kHz (output) if needed`
			`if SOPRANO_RATE != SAMPLE_RATE:`
			`num_samples = int(len(chunk_1d) * SAMPLE_RATE / SOPRANO_RATE)`
			`chunk_resampled = scipy_signal.resample(chunk_1d, num_samples)`
			`else:`
			`chunk_resampled = chunk_1d`

			`# Ensure no NaN or inf values after resampling (clip to valid range)`
			`if not np.all(np.isfinite(chunk_resampled)):`
			`print(f"⚠️ Warning: Invalid values after resampling, cleaning...")`
			`chunk_resampled = np.nan_to_num(chunk_resampled, nan=0.0, posinf=1.0, neginf=-1.0)`
			`chunk_resampled = np.clip(chunk_resampled, -1.0, 1.0)`

			`# Reshape to (N, 2) for stereo output (duplicate mono to both channels)`
			`chunk_stereo = np.column_stack((chunk_resampled, chunk_resampled)).astype(np.float32)`

			`# Write to virtual sink`
			`out_stream.write(chunk_stereo)`

			`print("✓ Speech generation and streaming completed")`
			`return True`

			`except Exception as e:`
			`print(f"✗ Error during streaming: {e}")`
			`import traceback`
			`traceback.print_exc()`
			`return False`


			`def main():`
			`"""Main function"""`
			`global running`

			`# Set up signal handler for graceful shutdown`
			`signal.signal(signal.SIGINT, signal_handler)`

			`print("=" * 70)`
			`print("Soprano TTS to Virtual Sink for RVC")`
			`print("=" * 70)`
			`print()`

			`# Create virtual sink`
			`if not create_virtual_sink():`
			`print("\n⚠️ If sink already exists, removing and recreating...")`
			`remove_virtual_sink()`
			`if not create_virtual_sink():`
			`print("✗ Failed to create virtual sink. Exiting.")`
			`return 1`

			`print()`
			`print("=" * 70)`
			`print("Virtual sink setup complete!")`
			`print("=" * 70)`
			`print()`
			`print("📝 Next steps:")`
			`print(f" 1. Open RVC realtime GUI (gui_v1.py)")`
			`print(f" 2. Select '{VIRTUAL_SINK_NAME}.monitor' as the INPUT device")`
			`print(f" 3. Select your desired output device")`
			`print(f" 4. Load your RVC model and start conversion")`
			`print(f" 5. Return here and type text to convert")`
			`print()`
			`print("=" * 70)`
			`print()`

			`# Initialize Soprano TTS`
			`print("🔄 Loading Soprano TTS model...")`
			`try:`
			`tts = SopranoTTS(`
			`backend='auto',`
			`device='auto',`
			`cache_size_mb=100,`
			`decoder_batch_size=1`
			`)`
			`print("✓ Soprano TTS model loaded successfully")`
			`except Exception as e:`
			`print(f"✗ Failed to load Soprano TTS: {e}")`
			`remove_virtual_sink()`
			`return 1`

			`print()`
			`print("=" * 70)`
			`print("Ready! Type text to generate speech (Ctrl+C to exit)")`
			`print("=" * 70)`
			`print()`

			`# Main loop - get text input and generate speech`
			`try:`
			`while running:`
			`try:`
			`text = input("\n🎙️ Enter text: ").strip()`

			`if not text:`
			`print("⚠️ Please enter some text")`
			`continue`

			`if text.lower() in ['quit', 'exit', 'q']:`
			`break`

			`# Stream the text to the virtual sink`
			`stream_to_virtual_sink(tts, text, chunk_size=1)`
			`print()`

			`except EOFError:`
			`break`

			`except KeyboardInterrupt:`
			`print("\n\n⚠️ Interrupted by user")`

			`finally:`
			`# Clean up`
			`remove_virtual_sink()`
			`print("\n✓ Cleanup complete. Goodbye!")`

			`return 0`


			`if __name__ == "__main__":`
			`sys.exit(main())`