unified soprano to rvc script

2026-01-13 00:20:55 +02:00
parent 5eedbb80e4
commit 346f9ccbda
4 changed files with 469 additions and 2 deletions
--- a/unified_soprano_rvc.py
+++ b/unified_soprano_rvc.py
@@ -0,0 +1,415 @@
+#!/usr/bin/env python3
+"""
+Unified Soprano TTS + RVC Pipeline
+Combines soprano_to_virtual_sink.py and headless_rvc.py into a single interface
+"""
+
+import sys
+import os
+import json
+import argparse
+import threading
+import time
+import subprocess
+import warnings
+import logging
+from pathlib import Path
+from dataclasses import dataclass, asdict
+from typing import Optional
+from contextlib import redirect_stdout, redirect_stderr
+
+# Logging: the root logger reports INFO and above, prefixed with the level name.
+logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
+# basicConfig() does nothing when the root logger already has handlers,
+# so the level is also set explicitly.
+logging.getLogger().setLevel(logging.INFO)
+
+# Silence Python warnings and the native libraries' own log output.
+# The environment variables are set here, ahead of the heavy imports below.
+warnings.filterwarnings('ignore')
+os.environ.update({
+    'TF_CPP_MIN_LOG_LEVEL': '3',
+    'MIOPEN_LOG_LEVEL': '1',  # Suppress MIOpen warnings
+})
+
+# Put the "soprano" directory that sits next to this script first on sys.path
+sys.path.insert(0, str(Path(__file__).parent.joinpath("soprano")))
+
+import numpy as np
+import sounddevice as sd
+from scipy import signal as scipy_signal
+import torch
+
+# Import soprano
+from soprano import SopranoTTS
+
+
+@dataclass
+class RVCConfig:
+    """RVC configuration parameters.
+
+    ``pth`` and ``index`` are required; every other field has a default.
+    A configuration can be saved to / loaded from a JSON object whose keys
+    are exactly the field names below.
+    """
+    pth: str  # path to the RVC model (.pth file)
+    index: str  # path to the index file
+    pitch: int = 0  # pitch shift in semitones
+    formant: float = 0.0
+    index_rate: float = 0.75
+    filter_radius: int = 3
+    rms_mix_rate: float = 0.25
+    protect: float = 0.33
+    f0method: str = "rmvpe"
+    input_device: str = "soprano_rvc"
+    output_device: Optional[str] = None  # None is passed through unchanged
+    samplerate: int = 48000
+    channels: int = 2
+    block_time: float = 0.25
+    crossfade_time: float = 0.04
+    extra_time: float = 2.5
+    n_cpu: int = 4
+    I_noise_reduce: bool = False
+    O_noise_reduce: bool = False
+    use_pv: bool = True
+    threshold: float = -60.0
+
+    @classmethod
+    def from_file(cls, config_path: str) -> 'RVCConfig':
+        """Load configuration from a JSON file.
+
+        Args:
+            config_path: Path of a UTF-8 JSON file holding one object.
+
+        Returns:
+            A new ``RVCConfig`` built from the object's keys.
+
+        Raises:
+            OSError: The file cannot be opened.
+            json.JSONDecodeError: The file is not valid JSON.
+            TypeError: The document is not a JSON object, contains keys
+                that are not configuration fields, or lacks a required field.
+        """
+        # Explicit encoding: model paths may contain non-ASCII characters and
+        # the platform default encoding is not necessarily UTF-8.
+        with open(config_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        if not isinstance(data, dict):
+            raise TypeError(
+                f"{config_path}: expected a JSON object, got {type(data).__name__}"
+            )
+
+        # Report stray keys by name instead of the generic
+        # "unexpected keyword argument" error from the constructor.
+        unknown = sorted(set(data) - set(cls.__dataclass_fields__))
+        if unknown:
+            raise TypeError(
+                f"{config_path}: unknown configuration key(s): {', '.join(unknown)}"
+            )
+
+        return cls(**data)
+
+    def to_file(self, config_path: str):
+        """Save configuration to a JSON file (UTF-8, 2-space indent).
+
+        Args:
+            config_path: Destination path; an existing file is overwritten.
+        """
+        with open(config_path, 'w', encoding='utf-8') as f:
+            # ensure_ascii=False keeps non-ASCII paths readable in the file.
+            json.dump(asdict(self), f, indent=2, ensure_ascii=False)
+
+
+class UnifiedPipeline:
+    """Unified Soprano TTS + RVC pipeline.
+
+    Text is synthesised with Soprano, resampled to the virtual sink's sample
+    rate and written to an output stream opened on that sink. A HeadlessRVC
+    instance, configured from ``rvc_config``, is started in a background
+    thread.
+    """
+
+    def __init__(self, rvc_config: RVCConfig, virtual_sink_name: str = "soprano_to_rvc"):
+        """Store the configuration; no audio or model resources are opened here.
+
+        Args:
+            rvc_config: Settings forwarded to HeadlessRVC by ``start_rvc``.
+            virtual_sink_name: Name of the PulseAudio sink Soprano writes to.
+        """
+        self.rvc_config = rvc_config
+        self.virtual_sink_name = virtual_sink_name
+        self.soprano = None
+        self.rvc = None  # HeadlessRVC instance, assigned by the worker thread
+        self.rvc_process = None
+        self.rvc_thread = None
+        self.soprano_stream = None
+        self.running = False
+        self._rvc_error = None  # exception raised while starting RVC, if any
+
+        # Soprano audio parameters
+        self.soprano_sample_rate = 32000
+        self.virtual_sink_sample_rate = 48000
+
+    def ensure_virtual_sink(self):
+        """Ensure the PulseAudio virtual sink exists, creating it if needed.
+
+        Raises:
+            RuntimeError: ``pactl load-module`` reported a failure.
+        """
+        print("Checking virtual sink...")
+
+        listing = subprocess.run(
+            ["pactl", "list", "sinks", "short"],
+            capture_output=True,
+            text=True
+        )
+
+        # The short listing is tab-separated with the sink name in the second
+        # column. Compare names exactly: a substring test against the whole
+        # output would also match e.g. "<name>_2" or a driver/description.
+        existing = set()
+        for line in listing.stdout.splitlines():
+            columns = line.split("\t")
+            if len(columns) >= 2:
+                existing.add(columns[1])
+
+        if self.virtual_sink_name in existing:
+            print(f"✓ Virtual sink '{self.virtual_sink_name}' already exists")
+            return
+
+        print(f"Creating virtual sink: {self.virtual_sink_name}")
+        created = subprocess.run(
+            [
+                "pactl", "load-module", "module-null-sink",
+                f"sink_name={self.virtual_sink_name}",
+                "sink_properties=device.description='Soprano_to_RVC_Virtual_Sink'",
+                f"rate={self.virtual_sink_sample_rate}",
+                "channels=2"
+            ],
+            capture_output=True,
+            text=True
+        )
+        if created.returncode != 0:
+            # Fail here with pactl's own message rather than later with an
+            # unrelated "device not found" error when the stream is opened.
+            raise RuntimeError(
+                f"Failed to create virtual sink '{self.virtual_sink_name}': "
+                f"{created.stderr.strip() or 'pactl exited with ' + str(created.returncode)}"
+            )
+        time.sleep(0.5)  # give the sound server a moment to register the sink
+
+    def initialize_soprano(self):
+        """Load Soprano TTS and open the output stream on the virtual sink.
+
+        Raises:
+            Exception: Whatever ``sd.OutputStream`` raised is re-raised after
+                a short message is printed.
+        """
+        print("\n" + "="*70)
+        print("Initializing Soprano TTS...")
+        print("="*70)
+
+        # Suppress verbose output during initialization
+        with open(os.devnull, 'w') as devnull:
+            with redirect_stdout(devnull), redirect_stderr(devnull):
+                self.soprano = SopranoTTS(device="cuda")
+
+        # Open audio stream to virtual sink
+        try:
+            self.soprano_stream = sd.OutputStream(
+                device=self.virtual_sink_name,
+                samplerate=self.virtual_sink_sample_rate,
+                channels=2,
+                dtype='float32',
+                blocksize=1024
+            )
+            self.soprano_stream.start()
+            print("✓ Soprano TTS initialized successfully")
+            print(f"  Output: {self.virtual_sink_name} ({self.virtual_sink_sample_rate}Hz, stereo)")
+        except Exception as e:
+            print(f"✗ Failed to open audio stream: {e}")
+            raise
+
+    def start_rvc(self, startup_timeout: float = 60.0):
+        """Start headless RVC in a background thread and wait until it is up.
+
+        Args:
+            startup_timeout: Maximum number of seconds to wait for
+                ``HeadlessRVC.start()`` to return. On timeout a warning is
+                printed and the pipeline continues.
+
+        Raises:
+            RuntimeError: Importing, constructing or starting HeadlessRVC
+                raised; the original exception is attached as the cause.
+        """
+        print("\n" + "="*70)
+        print("Starting RVC Voice Conversion...")
+        print("="*70)
+
+        ready = threading.Event()
+        self._rvc_error = None
+        self.rvc = None
+
+        def run_rvc():
+            try:
+                # Suppress logging from RVC
+                logging.getLogger('faiss').setLevel(logging.ERROR)
+                logging.getLogger('fairseq').setLevel(logging.ERROR)
+
+                # Import here to avoid conflicts
+                from headless_rvc import HeadlessRVC, HeadlessRVCConfig
+
+                # Convert our config to the dict HeadlessRVCConfig takes
+                cfg = self.rvc_config
+                config_dict = {
+                    'pth_path': cfg.pth,
+                    'index_path': cfg.index,
+                    'pitch': cfg.pitch,
+                    'formant': cfg.formant,
+                    'index_rate': cfg.index_rate,
+                    'filter_radius': cfg.filter_radius,
+                    'rms_mix_rate': cfg.rms_mix_rate,
+                    'protect': cfg.protect,
+                    'f0method': cfg.f0method,
+                    'input_device': cfg.input_device,
+                    'output_device': cfg.output_device,
+                    'samplerate': cfg.samplerate,
+                    'channels': cfg.channels,
+                    'block_time': cfg.block_time,
+                    'crossfade_time': cfg.crossfade_time,
+                    'extra_time': cfg.extra_time,
+                    'n_cpu': cfg.n_cpu,
+                    'I_noise_reduce': cfg.I_noise_reduce,
+                    'O_noise_reduce': cfg.O_noise_reduce,
+                    'use_pv': cfg.use_pv,
+                    'threshold': cfg.threshold
+                }
+
+                # Redirect RVC output while it loads
+                with open(os.devnull, 'w') as devnull:
+                    with redirect_stdout(devnull), redirect_stderr(devnull):
+                        self.rvc = HeadlessRVC(HeadlessRVCConfig(config_dict))
+                        self.rvc.start()
+            except Exception as exc:
+                # stderr was redirected, so the failure would otherwise be
+                # invisible; hand it to the caller of start_rvc() instead.
+                self._rvc_error = exc
+                return
+            finally:
+                ready.set()
+
+            # Keep running until stopped
+            while self.running:
+                time.sleep(0.1)
+
+            # Suppress stop output too
+            with open(os.devnull, 'w') as devnull:
+                with redirect_stdout(devnull), redirect_stderr(devnull):
+                    self.rvc.stop()
+
+        self.rvc_thread = threading.Thread(target=run_rvc, daemon=True)
+        self.running = True
+        self.rvc_thread.start()
+
+        # redirect_stdout replaces sys.stdout for the whole process, so the
+        # main thread must not print (or prompt) until the worker has left
+        # its redirect block. Waiting on the event guarantees that, and
+        # replaces a fixed sleep that reported success unconditionally.
+        if not ready.wait(timeout=startup_timeout):
+            print(
+                f"Warning: RVC did not finish starting within {startup_timeout:.0f}s; "
+                "continuing while it loads",
+                file=sys.__stderr__,
+            )
+            return
+
+        if self._rvc_error is not None:
+            self.running = False
+            raise RuntimeError(
+                f"Failed to start RVC: {self._rvc_error}"
+            ) from self._rvc_error
+
+        print("✓ RVC initialized successfully")
+
+    def stream_audio_chunk(self, audio_chunk):
+        """Resample, sanitise and write one audio chunk to the virtual sink.
+
+        Args:
+            audio_chunk: A torch tensor or numpy array of samples at
+                ``soprano_sample_rate``, or ``None``. ``None`` and empty
+                chunks are ignored. One-dimensional input is duplicated to
+                two channels.
+        """
+        if audio_chunk is None:
+            return
+
+        # Convert torch tensor to numpy if needed; detach() so a tensor that
+        # still tracks gradients does not make .numpy() raise.
+        if torch.is_tensor(audio_chunk):
+            audio_chunk = audio_chunk.detach().cpu().numpy()
+
+        if len(audio_chunk) == 0:
+            return
+
+        # Ensure float32
+        audio_chunk = audio_chunk.astype(np.float32)
+
+        # Resample from the Soprano rate to the sink rate.
+        # NOTE(review): scipy.signal.resample is FFT-based and treats each
+        # chunk as periodic, so chunk boundaries may show artefacts; a
+        # stateful streaming resampler would avoid that - confirm by ear.
+        if self.soprano_sample_rate != self.virtual_sink_sample_rate:
+            num_samples_output = int(len(audio_chunk) * self.virtual_sink_sample_rate / self.soprano_sample_rate)
+            audio_chunk = scipy_signal.resample(audio_chunk, num_samples_output)
+
+        # Clean audio (handle NaN/inf)
+        audio_chunk = np.nan_to_num(audio_chunk, nan=0.0, posinf=0.0, neginf=0.0)
+        audio_chunk = np.clip(audio_chunk, -1.0, 1.0)
+
+        # Convert mono to stereo
+        if audio_chunk.ndim == 1:
+            audio_chunk = np.column_stack((audio_chunk, audio_chunk))
+
+        # The stream was opened with dtype='float32'; make sure the processing
+        # above did not widen the array, and hand over a contiguous buffer.
+        audio_chunk = np.ascontiguousarray(audio_chunk, dtype=np.float32)
+
+        # Write to stream
+        try:
+            self.soprano_stream.write(audio_chunk)
+        except Exception as e:
+            print(f"Warning: Failed to write audio chunk: {e}")
+
+    def process_text(self, text: str):
+        """Synthesise ``text`` with Soprano and stream it to the virtual sink.
+
+        Blank input is ignored. About 0.1 s of silence is appended after the
+        last chunk.
+        """
+        if not text.strip():
+            return
+
+        print(f"\n🎤 Processing: {text}", flush=True)
+
+        # Only Soprano's own output is silenced: the redirect wraps pulling
+        # the next chunk, not stream_audio_chunk(), so write warnings from
+        # the latter still reach the terminal.
+        end_of_stream = object()
+        with open(os.devnull, 'w') as devnull:
+            with redirect_stdout(devnull), redirect_stderr(devnull):
+                chunks = iter(self.soprano.infer_stream(text))
+            while True:
+                with redirect_stdout(devnull), redirect_stderr(devnull):
+                    audio_chunk = next(chunks, end_of_stream)
+                if audio_chunk is end_of_stream:
+                    break
+                if audio_chunk is not None:
+                    self.stream_audio_chunk(audio_chunk)
+
+        # Small silence at the end. stream_audio_chunk() resamples from the
+        # Soprano rate, so the buffer is sized at that rate to come out as
+        # 0.1 s on the sink.
+        silence = np.zeros(int(0.1 * self.soprano_sample_rate), dtype=np.float32)
+        self.stream_audio_chunk(silence)
+        print("✓ Done\n", flush=True)
+
+    def run(self):
+        """Set everything up, then read lines from stdin until told to stop.
+
+        ``cleanup()`` always runs on the way out, including after a failure
+        during setup.
+        """
+        try:
+            # Setup
+            self.ensure_virtual_sink()
+            self.initialize_soprano()
+            self.start_rvc()
+
+            print("\n" + "="*70)
+            print("UNIFIED SOPRANO TTS + RVC PIPELINE")
+            print("="*70)
+            print(f"\n✓ Ready! Voice conversion active (pitch: {self.rvc_config.pitch:+d} semitones)")
+            print("\nCommands:")
+            print("  - Type text and press Enter to generate speech")
+            print("  - Type 'quit' or 'exit' to stop")
+            print("  - Press Ctrl+C to stop")
+            print("="*70 + "\n")
+
+            # Interactive loop
+            while self.running:
+                try:
+                    text = input("💬 > ").strip()
+
+                    if text.lower() in ('quit', 'exit', 'q'):
+                        break
+
+                    if text:
+                        self.process_text(text)
+
+                except (EOFError, KeyboardInterrupt):
+                    # End of input or Ctrl+C: leave the loop and clean up
+                    break
+
+        finally:
+            self.cleanup()
+
+    def cleanup(self):
+        """Stop the audio stream and the RVC thread. Safe to call twice."""
+        print("\n\n⏹️  Stopping...")
+        self.running = False  # signals the RVC worker loop to exit
+
+        # Detach the stream first so a second call does not touch it again.
+        stream, self.soprano_stream = self.soprano_stream, None
+        if stream is not None:
+            try:
+                try:
+                    stream.stop()
+                finally:
+                    stream.close()
+            except Exception as e:
+                # A stream error must not prevent the RVC thread from being joined.
+                print(f"Warning: Failed to close audio stream: {e}")
+
+        if self.rvc_thread and self.rvc_thread.is_alive():
+            self.rvc_thread.join(timeout=2)
+
+        print("✓ Stopped")
+        print("👋 Goodbye!\n")
+
+
+def main():
+    """Parse the command line, build the RVC configuration and run the pipeline.
+
+    The configuration comes from ``--config`` when given, otherwise from
+    ``--pth``/``--index`` plus ``RVCConfig``'s defaults. Options passed
+    explicitly on the command line override values loaded from the file.
+    With ``--save-config`` the resulting configuration is written out and
+    the pipeline is not started.
+    """
+    parser = argparse.ArgumentParser(
+        description="Unified Soprano TTS + RVC Pipeline",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Run with command-line arguments
+  python unified_soprano_rvc.py --pth model.pth --index model.index --pitch 0
+  
+  # Load from config file
+  python unified_soprano_rvc.py --config rvc_config.json
+  
+  # Save current config
+  python unified_soprano_rvc.py --pth model.pth --index model.index --save-config my_config.json
+        """
+    )
+
+    # Config file options
+    parser.add_argument('--config', type=str, help='Load RVC configuration from JSON file')
+    parser.add_argument('--save-config', type=str, help='Save configuration to JSON file and exit')
+
+    # Value options deliberately have no argparse default (None): that lets
+    # "not given" be told apart from "given", so a config file is only
+    # overridden by options the user actually typed. The effective defaults
+    # quoted in the help texts are the RVCConfig field defaults.
+
+    # RVC parameters
+    parser.add_argument('--pth', type=str, help='Path to RVC model (.pth file)')
+    parser.add_argument('--index', type=str, help='Path to index file')
+    parser.add_argument('--pitch', type=int, help='Pitch shift in semitones (default: 0)')
+    parser.add_argument('--formant', type=float, help='Formant shift (default: 0.0)')
+    parser.add_argument('--index-rate', type=float, help='Index rate (default: 0.75)')
+    parser.add_argument('--filter-radius', type=int, help='Filter radius (default: 3)')
+    parser.add_argument('--rms-mix-rate', type=float, help='RMS mix rate (default: 0.25)')
+    parser.add_argument('--protect', type=float, help='Protect voiceless consonants (default: 0.33)')
+    parser.add_argument('--f0method', type=str,
+                       choices=['rmvpe', 'harvest', 'crepe', 'fcpe'],
+                       help='F0 extraction method (default: rmvpe)')
+
+    # Audio device settings
+    parser.add_argument('--input-device', type=str,
+                       help='Input audio device for RVC (default: soprano_rvc)')
+    parser.add_argument('--output-device', type=str, help='Output audio device (default: system default)')
+    parser.add_argument('--samplerate', type=int, help='Sample rate (default: 48000)')
+
+    # Advanced options
+    parser.add_argument('--n-cpu', type=int, help='Number of CPU cores for F0 extraction (default: 4)')
+    parser.add_argument('--threshold', type=float, help='Silence threshold in dB (default: -60)')
+    parser.add_argument('--I-noise-reduce', action='store_true', help='Enable input noise reduction')
+    parser.add_argument('--O-noise-reduce', action='store_true', help='Enable output noise reduction')
+    parser.add_argument('--no-use-pv', action='store_true', help='Disable phase vocoder')
+
+    # Virtual sink name
+    parser.add_argument('--virtual-sink', type=str, default='soprano_to_rvc',
+                       help='Name of virtual sink (default: soprano_to_rvc)')
+
+    args = parser.parse_args()
+
+    # Collect only what was explicitly given on the command line.
+    value_options = (
+        'pth', 'index', 'pitch', 'formant', 'index_rate', 'filter_radius',
+        'rms_mix_rate', 'protect', 'f0method', 'input_device',
+        'output_device', 'samplerate', 'n_cpu', 'threshold',
+    )
+    overrides = {
+        name: getattr(args, name)
+        for name in value_options
+        if getattr(args, name) is not None
+    }
+    # store_true flags can only switch a setting one way
+    if args.I_noise_reduce:
+        overrides['I_noise_reduce'] = True
+    if args.O_noise_reduce:
+        overrides['O_noise_reduce'] = True
+    if args.no_use_pv:
+        overrides['use_pv'] = False
+
+    # Load or create config
+    if args.config:
+        print(f"Loading configuration from: {args.config}")
+        try:
+            rvc_config = RVCConfig.from_file(args.config)
+        except (OSError, ValueError, TypeError) as exc:
+            # Missing file, malformed JSON (JSONDecodeError is a ValueError)
+            # or wrong keys: report as a usage error, not a traceback.
+            parser.error(f"could not load config '{args.config}': {exc}")
+        # Command-line options win over the file
+        for name, value in overrides.items():
+            setattr(rvc_config, name, value)
+    else:
+        # Validate required arguments
+        if not args.pth or not args.index:
+            parser.error("--pth and --index are required (or use --config)")
+
+        rvc_config = RVCConfig(**overrides)
+
+    # Save config if requested
+    if args.save_config:
+        rvc_config.to_file(args.save_config)
+        print(f"✓ Configuration saved to: {args.save_config}")
+        return
+
+    # Run pipeline
+    pipeline = UnifiedPipeline(rvc_config, virtual_sink_name=args.virtual_sink)
+    pipeline.run()
+
+
+if __name__ == "__main__":
+    main()