#!/usr/bin/env python3
"""
Unified Soprano TTS + RVC Pipeline
Combines soprano_to_virtual_sink.py and headless_rvc.py into a single interface
"""

import sys
import os
import json
import argparse
import threading
import time
import subprocess
import warnings
import logging
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional
from contextlib import redirect_stdout, redirect_stderr

# Configure logging - only show INFO and above
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s: %(message)s'
)

# Suppress debug logs from all modules
logging.getLogger().setLevel(logging.INFO)

# Suppress warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['MIOPEN_LOG_LEVEL'] = '1'  # Suppress MIOpen warnings

# Add soprano to path
sys.path.insert(0, str(Path(__file__).parent / "soprano"))

import numpy as np
import sounddevice as sd
from scipy import signal as scipy_signal
import torch

# Import soprano
from soprano import SopranoTTS


@dataclass
class RVCConfig:
    """RVC configuration parameters.

    Field names match the command-line options defined in ``main`` and the
    keys of the JSON file read/written by ``from_file`` / ``to_file``.
    """
    pth: str                                # Path to RVC model (.pth file)
    index: str                              # Path to index file
    pitch: int = 0                          # Pitch shift in semitones
    formant: float = 0.0                    # Formant shift
    index_rate: float = 0.75
    filter_radius: int = 3
    rms_mix_rate: float = 0.25
    protect: float = 0.33                   # Protect voiceless consonants
    f0method: str = "rmvpe"                 # F0 extraction method
    input_device: str = "soprano_rvc"       # Input audio device for RVC
    output_device: Optional[str] = None     # None -> system default
    samplerate: int = 48000
    channels: int = 2
    block_time: float = 0.25                # presumably seconds - TODO confirm
    crossfade_time: float = 0.04            # presumably seconds - TODO confirm
    extra_time: float = 2.5                 # presumably seconds - TODO confirm
    n_cpu: int = 4                          # CPU cores for F0 extraction
    I_noise_reduce: bool = False            # Input noise reduction
    O_noise_reduce: bool = False            # Output noise reduction
    use_pv: bool = True                     # Phase vocoder
    threshold: float = -60.0                # Silence threshold in dB

    @classmethod
    def from_file(cls, config_path: str) -> 'RVCConfig':
        """Load configuration from JSON file.

        Raises:
            OSError: the file cannot be opened.
            json.JSONDecodeError: the file is not valid JSON.
            TypeError: the JSON has unknown keys or lacks ``pth``/``index``.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls(**data)

    def to_file(self, config_path: str) -> None:
        """Save configuration to JSON file"""
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(self), f, indent=2)

    def to_headless_dict(self) -> dict:
        """Return the settings as the dict handed to ``HeadlessRVCConfig``.

        Identical to ``asdict(self)`` except that ``pth`` and ``index`` are
        exposed under the keys ``pth_path`` and ``index_path``.
        """
        settings = asdict(self)
        settings['pth_path'] = settings.pop('pth')
        settings['index_path'] = settings.pop('index')
        return settings


class UnifiedPipeline:
    """Unified Soprano TTS + RVC pipeline"""

    def __init__(self, rvc_config: RVCConfig,
                 virtual_sink_name: str = "soprano_to_rvc"):
        self.rvc_config = rvc_config
        self.virtual_sink_name = virtual_sink_name
        self.soprano = None
        self.rvc = None           # HeadlessRVC instance, created by start_rvc()
        self.rvc_process = None
        self.rvc_thread = None
        self.soprano_stream = None
        self.running = False

        # Soprano audio parameters
        self.soprano_sample_rate = 32000
        self.virtual_sink_sample_rate = 48000

    @staticmethod
    def _parse_sink_names(pactl_output: str) -> set:
        """Extract sink names from ``pactl list sinks short`` output.

        Each line is expected to carry the sink index in the first column and
        the sink name in the second; shorter lines are ignored.
        """
        names = set()
        for line in pactl_output.splitlines():
            columns = line.split()
            if len(columns) >= 2:
                names.add(columns[1])
        return names

    def ensure_virtual_sink(self) -> None:
        """Ensure PulseAudio virtual sink exists.

        Creates a ``module-null-sink`` named ``self.virtual_sink_name`` when
        no sink with exactly that name is listed by ``pactl``.
        """
        print("Checking virtual sink...")

        # Check if sink exists
        result = subprocess.run(
            ["pactl", "list", "sinks", "short"],
            capture_output=True,
            text=True
        )

        # Compare whole names: a substring test would also match e.g.
        # "<name>_2" and wrongly skip creation.
        if self.virtual_sink_name in self._parse_sink_names(result.stdout):
            print(f"✓ Virtual sink '{self.virtual_sink_name}' already exists")
            return

        print(f"Creating virtual sink: {self.virtual_sink_name}")
        load = subprocess.run([
            "pactl", "load-module", "module-null-sink",
            f"sink_name={self.virtual_sink_name}",
            "sink_properties=device.description='Soprano_to_RVC_Virtual_Sink'",
            f"rate={self.virtual_sink_sample_rate}",
            "channels=2"
        ])
        if load.returncode != 0:
            # Best effort: opening the output stream will fail loudly later.
            print(f"Warning: pactl load-module exited with code {load.returncode}")
        time.sleep(0.5)  # give the audio server a moment to register the sink

    def initialize_soprano(self) -> None:
        """Initialize Soprano TTS and open the output stream to the virtual sink.

        Raises:
            Exception: whatever ``sd.OutputStream`` raises when the stream
                cannot be opened or started (re-raised after logging).
        """
        print("\n" + "="*70)
        print("Initializing Soprano TTS...")
        print("="*70)

        # Suppress verbose output during initialization
        with open(os.devnull, 'w') as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                self.soprano = SopranoTTS(device="cuda")

        # Open audio stream to virtual sink
        try:
            self.soprano_stream = sd.OutputStream(
                device=self.virtual_sink_name,
                samplerate=self.virtual_sink_sample_rate,
                channels=2,
                dtype='float32',
                blocksize=1024
            )
            self.soprano_stream.start()
            print("✓ Soprano TTS initialized successfully")
            print(f" Output: {self.virtual_sink_name} ({self.virtual_sink_sample_rate}Hz, stereo)")
        except Exception as e:
            print(f"✗ Failed to open audio stream: {e}")
            raise

    def start_rvc(self, startup_timeout: Optional[float] = 30.0) -> None:
        """Start headless RVC in a separate daemon thread.

        Blocks until the worker thread has constructed and started
        ``HeadlessRVC`` (or failed to), instead of sleeping a fixed time.

        Args:
            startup_timeout: Maximum seconds to wait for the worker to report
                readiness. ``None`` waits indefinitely. On timeout a warning
                is printed and the method returns with the worker still
                initializing.

        Raises:
            RuntimeError: the worker raised while importing, constructing or
                starting RVC; the original exception is chained as the cause.
        """
        print("\n" + "="*70)
        print("Starting RVC Voice Conversion...")
        print("="*70)

        ready = threading.Event()
        startup_errors = []

        def run_rvc():
            try:
                # Suppress logging from RVC
                logging.getLogger('faiss').setLevel(logging.ERROR)
                logging.getLogger('fairseq').setLevel(logging.ERROR)

                # Import here to avoid conflicts
                from headless_rvc import HeadlessRVC, HeadlessRVCConfig

                # Redirect RVC output. NOTE: redirect_stdout swaps the
                # process-wide sys.stdout, so the caller must not print
                # until `ready` is set.
                with open(os.devnull, 'w') as devnull:
                    with redirect_stdout(devnull), redirect_stderr(devnull):
                        gui_config = HeadlessRVCConfig(
                            self.rvc_config.to_headless_dict()
                        )
                        self.rvc = HeadlessRVC(gui_config)
                        self.rvc.start()
            except Exception as exc:  # thread boundary: hand error to caller
                startup_errors.append(exc)
                return
            finally:
                # Set only after the redirect context has been left, so the
                # caller resumes with the real stdout/stderr restored.
                ready.set()

            # Keep running until stopped
            while self.running:
                time.sleep(0.1)

            # Suppress stop output too
            with open(os.devnull, 'w') as devnull:
                with redirect_stdout(devnull), redirect_stderr(devnull):
                    self.rvc.stop()

        self.rvc_thread = threading.Thread(target=run_rvc, daemon=True)
        self.running = True
        self.rvc_thread.start()

        # Wait for RVC to initialize (model loading can take a while)
        if not ready.wait(timeout=startup_timeout):
            # The worker may still hold the stdout redirect; use the
            # original stderr so the message is not swallowed.
            print(
                f"Warning: RVC still initializing after {startup_timeout}s",
                file=sys.__stderr__,
            )
            return

        if startup_errors:
            self.running = False
            cause = startup_errors[0]
            raise RuntimeError(f"RVC failed to start: {cause}") from cause

        print("✓ RVC initialized successfully")

    def stream_audio_chunk(self, audio_chunk) -> None:
        """Stream an audio chunk to the virtual sink.

        Accepts a torch tensor or array-like of samples at
        ``self.soprano_sample_rate``. The chunk is resampled to
        ``self.virtual_sink_sample_rate``, sanitized (NaN/inf -> 0, clipped
        to [-1, 1]), duplicated to two channels when one-dimensional, and
        written to ``self.soprano_stream``. ``None`` and empty chunks are
        ignored; write failures are reported as a warning, not raised.
        """
        if audio_chunk is None:
            return

        # Convert torch tensor to numpy if needed
        if torch.is_tensor(audio_chunk):
            audio_chunk = audio_chunk.detach().cpu().numpy()

        audio_chunk = np.asarray(audio_chunk, dtype=np.float32)
        if audio_chunk.size == 0:
            return

        # Resample from 32kHz to 48kHz
        if self.soprano_sample_rate != self.virtual_sink_sample_rate:
            num_samples_output = int(
                len(audio_chunk)
                * self.virtual_sink_sample_rate
                / self.soprano_sample_rate
            )
            if num_samples_output == 0:
                return
            audio_chunk = scipy_signal.resample(audio_chunk, num_samples_output)

        # Clean audio (handle NaN/inf)
        audio_chunk = np.nan_to_num(audio_chunk, nan=0.0, posinf=0.0, neginf=0.0)
        audio_chunk = np.clip(audio_chunk, -1.0, 1.0)

        # Convert mono to stereo
        if audio_chunk.ndim == 1:
            audio_chunk = np.column_stack((audio_chunk, audio_chunk))

        # The stream was opened with dtype='float32'; enforce dtype and
        # C-contiguity last so no earlier step can undo them.
        audio_chunk = np.ascontiguousarray(audio_chunk, dtype=np.float32)

        # Write to stream
        try:
            self.soprano_stream.write(audio_chunk)
        except Exception as e:
            # process_text() redirects sys.stdout/sys.stderr to devnull while
            # streaming, so report through the original stderr.
            print(f"Warning: Failed to write audio chunk: {e}",
                  file=sys.__stderr__)

    def process_text(self, text: str) -> None:
        """Process text through TTS and stream to virtual sink.

        Blank or whitespace-only text is ignored. After the generated audio,
        0.1 s of silence is written to the sink.
        """
        if not text.strip():
            return

        print(f"\n🎤 Processing: {text}", flush=True)

        # Generate and stream audio (suppress Soprano's verbose output)
        with open(os.devnull, 'w') as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                for audio_chunk in self.soprano.infer_stream(text):
                    # None chunks are skipped inside stream_audio_chunk
                    self.stream_audio_chunk(audio_chunk)

        # Small silence at the end
        silence = np.zeros(int(0.1 * self.virtual_sink_sample_rate), dtype=np.float32)
        self.stream_audio_chunk(silence)

        print("✓ Done\n", flush=True)

    def run(self) -> None:
        """Run the unified pipeline.

        Sets up the sink, Soprano and RVC, then reads lines from stdin until
        'quit'/'exit'/'q', EOF or Ctrl+C. ``cleanup`` always runs on exit.
        """
        try:
            # Setup
            self.ensure_virtual_sink()
            self.initialize_soprano()
            self.start_rvc()

            print("\n" + "="*70)
            print("UNIFIED SOPRANO TTS + RVC PIPELINE")
            print("="*70)
            print(f"\n✓ Ready! Voice conversion active (pitch: {self.rvc_config.pitch:+d} semitones)")
            print("\nCommands:")
            print(" - Type text and press Enter to generate speech")
            print(" - Type 'quit' or 'exit' to stop")
            print(" - Press Ctrl+C to stop")
            print("="*70 + "\n")

            # Interactive loop
            while self.running:
                try:
                    text = input("💬 > ").strip()

                    if text.lower() in ['quit', 'exit', 'q']:
                        break

                    if text:
                        self.process_text(text)

                except EOFError:
                    break

        except KeyboardInterrupt:
            pass  # fall through to cleanup

        finally:
            self.cleanup()

    def cleanup(self) -> None:
        """Clean up resources.

        Safe to call more than once: the stream reference is cleared before
        it is closed, and a failure while closing does not skip the join of
        the RVC thread.
        """
        print("\n\n⏹️ Stopping...")

        self.running = False  # tells the RVC worker loop to exit

        stream = self.soprano_stream
        self.soprano_stream = None
        if stream is not None:
            try:
                stream.stop()
                stream.close()
            except Exception as e:
                print(f"Warning: Failed to close audio stream: {e}")

        if self.rvc_thread and self.rvc_thread.is_alive():
            self.rvc_thread.join(timeout=2)

        print("✓ Stopped")
        print("👋 Goodbye!\n")


def main():
    """Parse command-line arguments, build the RVC config and run the pipeline."""
    parser = argparse.ArgumentParser(
        description="Unified Soprano TTS + RVC Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with command-line arguments
  python unified_soprano_rvc.py --pth model.pth --index model.index --pitch 0

  # Load from config file
  python unified_soprano_rvc.py --config rvc_config.json

  # Save current config
  python unified_soprano_rvc.py --pth model.pth --index model.index --save-config my_config.json
"""
    )

    # Config file options
    parser.add_argument('--config', type=str,
                        help='Load RVC configuration from JSON file')
    parser.add_argument('--save-config', type=str,
                        help='Save configuration to JSON file and exit')

    # RVC parameters
    parser.add_argument('--pth', type=str,
                        help='Path to RVC model (.pth file)')
    parser.add_argument('--index', type=str,
                        help='Path to index file')
    parser.add_argument('--pitch', type=int, default=0,
                        help='Pitch shift in semitones (default: 0)')
    parser.add_argument('--formant', type=float, default=0.0,
                        help='Formant shift (default: 0.0)')
    parser.add_argument('--index-rate', type=float, default=0.75,
                        help='Index rate (default: 0.75)')
    parser.add_argument('--filter-radius', type=int, default=3,
                        help='Filter radius (default: 3)')
    parser.add_argument('--rms-mix-rate', type=float, default=0.25,
                        help='RMS mix rate (default: 0.25)')
    parser.add_argument('--protect', type=float, default=0.33,
                        help='Protect voiceless consonants (default: 0.33)')
    parser.add_argument('--f0method', type=str, default='rmvpe',
                        choices=['rmvpe', 'harvest', 'crepe', 'fcpe'],
                        help='F0 extraction method (default: rmvpe)')

    # Audio device settings
    parser.add_argument('--input-device', type=str, default='soprano_rvc',
                        help='Input audio device for RVC (default: soprano_rvc)')
    parser.add_argument('--output-device', type=str,
                        help='Output audio device (default: system default)')
    parser.add_argument('--samplerate', type=int, default=48000,
                        help='Sample rate (default: 48000)')

    # Advanced options
    parser.add_argument('--n-cpu', type=int, default=4,
                        help='Number of CPU cores for F0 extraction (default: 4)')
    parser.add_argument('--threshold', type=float, default=-60.0,
                        help='Silence threshold in dB (default: -60)')
    parser.add_argument('--I-noise-reduce', action='store_true',
                        help='Enable input noise reduction')
    parser.add_argument('--O-noise-reduce', action='store_true',
                        help='Enable output noise reduction')
    parser.add_argument('--no-use-pv', action='store_true',
                        help='Disable phase vocoder')

    # Virtual sink name
    parser.add_argument('--virtual-sink', type=str, default='soprano_to_rvc',
                        help='Name of virtual sink (default: soprano_to_rvc)')

    args = parser.parse_args()

    # Load or create config
    if args.config:
        # NOTE: when --config is given, the individual RVC options on the
        # command line are not applied; the file is used as-is.
        print(f"Loading configuration from: {args.config}")
        try:
            rvc_config = RVCConfig.from_file(args.config)
        except (OSError, json.JSONDecodeError, TypeError) as e:
            # Report a usage error instead of a raw traceback
            parser.error(f"could not load config '{args.config}': {e}")
    else:
        # Validate required arguments
        if not args.pth or not args.index:
            parser.error("--pth and --index are required (or use --config)")

        rvc_config = RVCConfig(
            pth=args.pth,
            index=args.index,
            pitch=args.pitch,
            formant=args.formant,
            index_rate=args.index_rate,
            filter_radius=args.filter_radius,
            rms_mix_rate=args.rms_mix_rate,
            protect=args.protect,
            f0method=args.f0method,
            input_device=args.input_device,
            output_device=args.output_device,
            samplerate=args.samplerate,
            n_cpu=args.n_cpu,
            I_noise_reduce=args.I_noise_reduce,
            O_noise_reduce=args.O_noise_reduce,
            use_pv=not args.no_use_pv,
            threshold=args.threshold
        )

    # Save config if requested
    if args.save_config:
        rvc_config.to_file(args.save_config)
        print(f"✓ Configuration saved to: {args.save_config}")
        return

    # Run pipeline
    pipeline = UnifiedPipeline(rvc_config, virtual_sink_name=args.virtual_sink)
    pipeline.run()


if __name__ == "__main__":
    main()