Working with GUI, auto loopback creation, soprano streaming

2026-01-12 22:55:21 +02:00
commit 942ca36252
12 changed files with 1984 additions and 0 deletions
--- a/soprano_to_virtual_sink.py
+++ b/soprano_to_virtual_sink.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+"""
+Soprano TTS to Virtual Sink
+This script takes text input and streams Soprano TTS output to a virtual PulseAudio sink
+that can be used as input for RVC realtime voice conversion.
+"""
+
+import sys
+import os
+import subprocess
+import signal
+import sounddevice as sd
+import numpy as np
+import torch
+from scipy import signal as scipy_signal
+
+# Add soprano to path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano'))
+from soprano import SopranoTTS
+
+# Configuration
+# Name of the PulseAudio null sink created by this script; its audio is read
+# back through the "<name>.monitor" source.
+VIRTUAL_SINK_NAME = "soprano_to_rvc"
+# Rate used both for the null sink and for the sounddevice output stream.
+SAMPLE_RATE = 48000  # Use 48kHz for better compatibility with audio systems
+# Rate of the audio produced by the TTS model; chunks are resampled from this
+# rate to SAMPLE_RATE before being written.
+SOPRANO_RATE = 32000  # Soprano outputs at 32kHz
+# Channel count of the output stream.
+CHANNELS = 2  # Use stereo to match RVC expectations
+
+# Global flag for graceful shutdown
+# Cleared by signal_handler(); checked by the streaming loop and the main loop.
+running = True
+
+
+def signal_handler(sig, frame):
+    """Handle Ctrl+C (SIGINT) by requesting a graceful shutdown.
+
+    Installed as the SIGINT handler in main(). It does not raise; it only
+    prints a notice and clears the module-level ``running`` flag, which the
+    streaming loop and the main input loop check to decide when to stop.
+
+    Args:
+        sig: Signal number passed by the interpreter (unused).
+        frame: Current stack frame passed by the interpreter (unused).
+    """
+    global running
+    print("\n\nShutting down gracefully...")
+    running = False
+
+
+def create_virtual_sink():
+    """Create the PulseAudio null sink used as the TTS output device.
+
+    If ``pactl`` already lists a sink named exactly ``VIRTUAL_SINK_NAME`` it is
+    reused; otherwise ``module-null-sink`` is loaded with ``SAMPLE_RATE`` and
+    ``CHANNELS``.
+
+    Returns:
+        bool: True if the sink already exists or was created, False if
+        ``pactl`` could not be run or the module failed to load.
+    """
+    # Check if sink already exists
+    existing_sinks = set()
+    try:
+        result = subprocess.run(
+            ["pactl", "list", "sinks", "short"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+    except OSError as e:
+        # pactl is missing or not executable, so nothing below can work either.
+        print(f"✗ Failed to create virtual sink: could not run pactl ({e})")
+        return False
+    except subprocess.CalledProcessError:
+        # Listing failed; fall through and let load-module report the problem.
+        pass
+    else:
+        # Short listing lines look like "<index> <name> <driver> ...".
+        # Compare the name column exactly so that a sink whose name merely
+        # contains ours is not mistaken for it.
+        for line in result.stdout.splitlines():
+            fields = line.split()
+            if len(fields) >= 2:
+                existing_sinks.add(fields[1])
+
+    if VIRTUAL_SINK_NAME in existing_sinks:
+        print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' already exists")
+        print(f"  Monitor source: {VIRTUAL_SINK_NAME}.monitor")
+        return True
+
+    print(f"Creating virtual sink: {VIRTUAL_SINK_NAME}")
+    try:
+        # Create a null sink (virtual audio device) at 48kHz for compatibility
+        subprocess.run([
+            "pactl", "load-module", "module-null-sink",
+            f"sink_name={VIRTUAL_SINK_NAME}",
+            f"sink_properties=device.description={VIRTUAL_SINK_NAME}",
+            f"rate={SAMPLE_RATE}",
+            f"channels={CHANNELS}"  # Same channel count as the output stream
+        ], check=True, capture_output=True)
+    except subprocess.CalledProcessError as e:
+        print(f"✗ Failed to create virtual sink: {e.stderr.decode(errors='replace')}")
+        return False
+
+    print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' created successfully")
+    print(f"  Monitor source: {VIRTUAL_SINK_NAME}.monitor")
+    return True
+
+
+def remove_virtual_sink():
+    """Unload every ``module-null-sink`` instance that provides the virtual sink.
+
+    Only modules named ``module-null-sink`` whose arguments contain
+    ``sink_name=<VIRTUAL_SINK_NAME>`` are unloaded, so other modules that merely
+    mention the sink in their arguments (for example a loopback reading from
+    its monitor) are left alone. This is best-effort cleanup: errors are
+    printed, never raised.
+    """
+    print(f"\nRemoving virtual sink: {VIRTUAL_SINK_NAME}")
+    sink_arg = f"sink_name={VIRTUAL_SINK_NAME}"
+    try:
+        # Find the module ID(s)
+        result = subprocess.run(
+            ["pactl", "list", "modules", "short"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        removed = False
+        for line in result.stdout.splitlines():
+            # Short listing lines look like "<id> <module name> <arguments...>".
+            fields = line.split()
+            if len(fields) < 3 or fields[1] != "module-null-sink":
+                continue
+            if sink_arg not in fields[2:]:
+                continue
+            subprocess.run(["pactl", "unload-module", fields[0]], check=True)
+            removed = True
+
+        if removed:
+            print("✓ Virtual sink removed")
+        else:
+            print("⚠️  No loaded module-null-sink matches this sink; nothing to remove")
+    except Exception as e:
+        # Cleanup path: report and carry on rather than crash during shutdown.
+        print(f"✗ Error removing virtual sink: {e}")
+
+
+def get_virtual_sink_device_id():
+    """Return the sounddevice index of the virtual sink, or None if not found.
+
+    The device list is re-read on every call, and the first device whose name
+    contains ``VIRTUAL_SINK_NAME`` and which can be opened for output is
+    returned.
+
+    Returns:
+        int | None: Index usable as ``device=`` for an output stream, or None.
+    """
+    # Force refresh device list
+    # NOTE: _terminate()/_initialize() are private sounddevice functions; they
+    # are used here so that a sink created after start-up becomes visible.
+    sd._terminate()
+    sd._initialize()
+
+    for index, device in enumerate(sd.query_devices()):
+        if VIRTUAL_SINK_NAME not in device['name']:
+            continue
+        # Skip entries that cannot be used for playback (e.g. a capture-only
+        # entry for the sink's monitor, if the backend lists one); opening an
+        # output stream on such a device would fail.
+        if device.get('max_output_channels', 0) > 0:
+            return index
+    return None
+
+
+def _find_sink_device_id():
+    """Return the virtual sink's device index, recreating the sink once if needed.
+
+    Returns None (after printing diagnostics) when the device cannot be found.
+    """
+    device_id = get_virtual_sink_device_id()
+    if device_id is not None:
+        return device_id
+
+    print(f"✗ Could not find virtual sink device: {VIRTUAL_SINK_NAME}")
+    print("⚠️  Attempting to recreate virtual sink...")
+    if not create_virtual_sink():
+        return None
+
+    # Wait a moment for the device to appear
+    import time
+    time.sleep(1.0)
+    device_id = get_virtual_sink_device_id()
+    if device_id is None:
+        print("✗ Still could not find virtual sink after recreation")
+        print("\n📋 Available devices:")
+        for i, dev in enumerate(sd.query_devices()):
+            lowered = dev['name'].lower()
+            if 'soprano' in lowered or 'rvc' in lowered:
+                print(f"   {i}: {dev['name']}")
+    return device_id
+
+
+def _prepare_chunk(chunk):
+    """Convert one TTS chunk into a clean (N, CHANNELS) float32 block at SAMPLE_RATE."""
+    # Convert torch tensor to numpy if needed (cast first so half-precision
+    # tensors can be converted as well).
+    if isinstance(chunk, torch.Tensor):
+        chunk = chunk.detach().cpu().float().numpy()
+
+    # Whatever the incoming layout, treat the chunk as one mono sample run.
+    mono = np.asarray(chunk, dtype=np.float32).reshape(-1)
+    if mono.size == 0:
+        # Nothing to resample; resampling to zero samples is not meaningful.
+        return np.zeros((0, CHANNELS), dtype=np.float32)
+
+    # Check for invalid values before resampling
+    if not np.all(np.isfinite(mono)):
+        print("⚠️  Warning: Invalid values in soprano output, cleaning...")
+        mono = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)
+
+    # Resample from 32kHz (Soprano) to 48kHz (output) if needed
+    # NOTE(review): scipy's FFT-based resample treats its input as periodic,
+    # so resampling chunk by chunk may leave small artifacts at chunk edges;
+    # a stateful resampler would avoid that — confirm by listening.
+    if SOPRANO_RATE != SAMPLE_RATE:
+        num_samples = int(len(mono) * SAMPLE_RATE / SOPRANO_RATE)
+        mono = scipy_signal.resample(mono, num_samples)
+
+    # Ensure no NaN or inf values after resampling (clip to valid range)
+    if not np.all(np.isfinite(mono)):
+        print("⚠️  Warning: Invalid values after resampling, cleaning...")
+        mono = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)
+    mono = np.clip(mono, -1.0, 1.0)
+
+    # Duplicate the mono signal into every output channel -> shape (N, CHANNELS)
+    return np.repeat(mono.reshape(-1, 1), CHANNELS, axis=1).astype(np.float32)
+
+
+def stream_to_virtual_sink(tts_model, text, chunk_size=1):
+    """Stream soprano TTS output to the virtual sink.
+
+    Args:
+        tts_model: Object providing ``infer_stream(text, chunk_size=...)``,
+            which yields audio chunks (torch tensors or array-likes).
+        text: Text to synthesize.
+        chunk_size: Passed through to ``infer_stream``.
+
+    Returns:
+        bool: True if all generated audio was written to the sink; False if
+        the sink device could not be found, streaming was interrupted via the
+        ``running`` flag, or an error occurred (the error is printed).
+    """
+    device_id = _find_sink_device_id()
+    if device_id is None:
+        return False
+
+    device_info = sd.query_devices(device_id)
+    print(f"✓ Using output device: {device_info['name']}")
+
+    # The stream is always opened at SAMPLE_RATE (the rate the sink is created
+    # with), regardless of the device's reported default.
+    print(f"  Sample rate: {SAMPLE_RATE} Hz")
+    print("\n🎤 Generating and streaming speech...")
+    print(f"Text: \"{text}\"\n")
+
+    try:
+        # Generate streaming audio from soprano
+        stream = tts_model.infer_stream(text, chunk_size=chunk_size)
+        interrupted = False
+
+        # Open output stream to virtual sink
+        with sd.OutputStream(
+            samplerate=SAMPLE_RATE,
+            channels=CHANNELS,
+            dtype='float32',
+            device=device_id,
+            blocksize=0
+        ) as out_stream:
+            first_chunk = True
+            for chunk in stream:
+                if not running:
+                    # Ctrl+C was pressed; stop between chunks.
+                    interrupted = True
+                    break
+
+                if first_chunk:
+                    print("✓ First audio chunk generated and streaming started")
+                    first_chunk = False
+
+                block = _prepare_chunk(chunk)
+                if len(block) == 0:
+                    continue
+
+                # Write to virtual sink
+                out_stream.write(block)
+
+        if interrupted:
+            print("⚠️  Streaming interrupted before completion")
+            return False
+
+        print("✓ Speech generation and streaming completed")
+        return True
+
+    except Exception as e:
+        print(f"✗ Error during streaming: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Set up the virtual sink and TTS model, then run the interactive loop.
+
+    Reads lines of text from standard input and streams each one to the
+    virtual sink until the user enters ``quit``/``exit``/``q``, sends EOF, or
+    presses Ctrl+C. The virtual sink is removed on the way out.
+
+    Returns:
+        int: 0 on normal exit, 1 if the sink or the TTS model could not be
+        set up.
+    """
+    # Set up signal handler for graceful shutdown
+    signal.signal(signal.SIGINT, signal_handler)
+
+    print("=" * 70)
+    print("Soprano TTS to Virtual Sink for RVC")
+    print("=" * 70)
+    print()
+
+    # Create virtual sink
+    if not create_virtual_sink():
+        print("\n⚠️  If sink already exists, removing and recreating...")
+        remove_virtual_sink()
+        if not create_virtual_sink():
+            print("✗ Failed to create virtual sink. Exiting.")
+            return 1
+
+    print()
+    print("=" * 70)
+    print("Virtual sink setup complete!")
+    print("=" * 70)
+    print()
+    print("📝 Next steps:")
+    print("   1. Open RVC realtime GUI (gui_v1.py)")
+    print(f"   2. Select '{VIRTUAL_SINK_NAME}.monitor' as the INPUT device")
+    print("   3. Select your desired output device")
+    print("   4. Load your RVC model and start conversion")
+    print("   5. Return here and type text to convert")
+    print()
+    print("=" * 70)
+    print()
+
+    # Initialize Soprano TTS
+    print("🔄 Loading Soprano TTS model...")
+    try:
+        tts = SopranoTTS(
+            backend='auto',
+            device='auto',
+            cache_size_mb=100,
+            decoder_batch_size=1
+        )
+        print("✓ Soprano TTS model loaded successfully")
+    except Exception as e:
+        print(f"✗ Failed to load Soprano TTS: {e}")
+        remove_virtual_sink()
+        return 1
+
+    print()
+    print("=" * 70)
+    print("Ready! Type text to generate speech (Ctrl+C to exit)")
+    print("=" * 70)
+    print()
+
+    # Main loop - get text input and generate speech
+    try:
+        while running:
+            # signal_handler only sets a flag and returns, so a blocked
+            # input() would just keep waiting after Ctrl+C. Use Python's
+            # default handler while prompting so Ctrl+C raises
+            # KeyboardInterrupt here, then restore the flag-based handler so
+            # streaming can still stop cleanly between chunks.
+            signal.signal(signal.SIGINT, signal.default_int_handler)
+            try:
+                text = input("\n🎙️  Enter text: ").strip()
+            except EOFError:
+                break
+            finally:
+                signal.signal(signal.SIGINT, signal_handler)
+
+            if not text:
+                print("⚠️  Please enter some text")
+                continue
+
+            if text.lower() in ['quit', 'exit', 'q']:
+                break
+
+            # Stream the text to the virtual sink
+            stream_to_virtual_sink(tts, text, chunk_size=1)
+            print()
+
+    except KeyboardInterrupt:
+        print("\n\n⚠️  Interrupted by user")
+
+    finally:
+        # Clean up
+        remove_virtual_sink()
+        print("\n✓ Cleanup complete. Goodbye!")
+
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())