unified soprano to rvc script

This commit is contained in:
2026-01-13 00:20:55 +02:00
parent 5eedbb80e4
commit 346f9ccbda
4 changed files with 469 additions and 2 deletions

415
unified_soprano_rvc.py Normal file
View File

@@ -0,0 +1,415 @@
#!/usr/bin/env python3
"""
Unified Soprano TTS + RVC Pipeline
Combines soprano_to_virtual_sink.py and headless_rvc.py into a single interface
"""
import sys
import os
import json
import argparse
import threading
import time
import subprocess
import warnings
import logging
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional
from contextlib import redirect_stdout, redirect_stderr
# Root logger: terse "LEVEL: message" lines, INFO and above only.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# Re-assert the level in case a handler was already installed (basicConfig
# is a no-op then), so debug records stay hidden either way.
logging.getLogger().setLevel(logging.INFO)

# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings('ignore')

# Quieten native libraries; set before the heavy imports that follow.
for _env_name, _env_value in (
    ('TF_CPP_MIN_LOG_LEVEL', '3'),
    ('MIOPEN_LOG_LEVEL', '1'),  # Suppress MIOpen warnings
):
    os.environ[_env_name] = _env_value
del _env_name, _env_value

# Make the bundled "soprano" directory (next to this script) importable first.
sys.path.insert(0, str(Path(__file__).parent / "soprano"))
import numpy as np
import sounddevice as sd
from scipy import signal as scipy_signal
import torch
# Import soprano
from soprano import SopranoTTS
@dataclass
class RVCConfig:
    """RVC configuration parameters.

    The field names are also the JSON keys used by ``from_file`` and
    ``to_file``. ``pth`` and ``index`` are required; everything else has a
    default.
    """
    # --- Model files -------------------------------------------------------
    pth: str                              # path to the RVC model (.pth file)
    index: str                            # path to the index file
    # --- Conversion parameters ---------------------------------------------
    pitch: int = 0                        # pitch shift in semitones
    formant: float = 0.0                  # formant shift
    index_rate: float = 0.75
    filter_radius: int = 3
    rms_mix_rate: float = 0.25
    protect: float = 0.33                 # protection for voiceless consonants
    f0method: str = "rmvpe"               # F0 extraction method
    # --- Audio devices -----------------------------------------------------
    input_device: str = "soprano_rvc"     # device RVC reads from
    output_device: Optional[str] = None   # None -> system default output
    samplerate: int = 48000
    channels: int = 2
    # NOTE(review): the three *_time values are presumably seconds - confirm
    # against headless_rvc before relying on the unit.
    block_time: float = 0.25
    crossfade_time: float = 0.04
    extra_time: float = 2.5
    # --- Advanced ----------------------------------------------------------
    n_cpu: int = 4                        # CPU cores used for F0 extraction
    I_noise_reduce: bool = False          # input noise reduction
    O_noise_reduce: bool = False          # output noise reduction
    use_pv: bool = True                   # phase vocoder on/off
    threshold: float = -60.0              # silence threshold in dB

    @classmethod
    def from_file(cls, config_path: str) -> 'RVCConfig':
        """Load configuration from a JSON file.

        Args:
            config_path: Path of a UTF-8 JSON file holding one object whose
                keys are field names of this class.

        Returns:
            A new ``RVCConfig`` built from the file contents.

        Raises:
            OSError: The file cannot be opened.
            json.JSONDecodeError: The file is not valid JSON.
            TypeError: The document is not a JSON object, contains keys that
                are not configuration fields, or lacks ``pth`` / ``index``.
        """
        from dataclasses import fields

        # JSON is UTF-8 by specification; do not depend on the locale.
        with open(config_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, dict):
            raise TypeError(
                f"{config_path}: expected a JSON object, "
                f"got {type(data).__name__}"
            )

        # Name the offending keys up front instead of letting the generated
        # __init__ complain about the first one only.
        known = {field.name for field in fields(cls)}
        unknown = sorted(key for key in data if key not in known)
        if unknown:
            raise TypeError(
                f"{config_path}: unknown configuration key(s): "
                f"{', '.join(unknown)}"
            )
        return cls(**data)

    def to_file(self, config_path: str) -> None:
        """Save configuration to a JSON file (UTF-8, 2-space indent).

        Args:
            config_path: Destination path; an existing file is overwritten.

        Raises:
            OSError: The file cannot be written.
        """
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(self), f, indent=2)
class UnifiedPipeline:
    """Unified Soprano TTS + RVC pipeline.

    Text typed by the user is synthesised by Soprano, resampled and written
    to a PulseAudio null sink. A headless RVC instance, driven from a
    background thread, is configured from ``rvc_config``.
    """

    def __init__(self, rvc_config: "RVCConfig", virtual_sink_name: str = "soprano_to_rvc"):
        """Store the configuration; no audio or model resources are opened here.

        Args:
            rvc_config: Settings forwarded to the headless RVC instance.
            virtual_sink_name: Name of the PulseAudio null sink that receives
                the synthesised audio.
        """
        self.rvc_config = rvc_config
        self.virtual_sink_name = virtual_sink_name
        self.soprano = None
        self.rvc = None
        self.rvc_process = None
        self.rvc_thread = None
        self.soprano_stream = None
        self.running = False
        # Start-up handshake between start_rvc() and its worker thread.
        self._rvc_ready = threading.Event()
        self._rvc_error = None
        # Soprano audio parameters
        self.soprano_sample_rate = 32000
        self.virtual_sink_sample_rate = 48000

    @staticmethod
    def _parse_sink_names(listing: str) -> set:
        """Return the sink names found in ``pactl list sinks short`` output.

        Each line is ``index name driver ...``; the name is the second
        column. Lines with fewer than two columns are skipped.
        """
        names = set()
        for line in listing.splitlines():
            columns = line.split()
            if len(columns) >= 2:
                names.add(columns[1])
        return names

    def ensure_virtual_sink(self):
        """Ensure the PulseAudio virtual sink exists, creating it if needed.

        Raises:
            FileNotFoundError: ``pactl`` is not installed.
        """
        print("Checking virtual sink...")
        listing = subprocess.run(
            ["pactl", "list", "sinks", "short"],
            capture_output=True,
            text=True
        )
        # Compare whole names: a substring test would also match e.g.
        # "<name>_old" and wrongly skip creating the sink.
        if self.virtual_sink_name in self._parse_sink_names(listing.stdout):
            print(f"✓ Virtual sink '{self.virtual_sink_name}' already exists")
            return

        print(f"Creating virtual sink: {self.virtual_sink_name}")
        created = subprocess.run([
            "pactl", "load-module", "module-null-sink",
            f"sink_name={self.virtual_sink_name}",
            "sink_properties=device.description='Soprano_to_RVC_Virtual_Sink'",
            f"rate={self.virtual_sink_sample_rate}",
            "channels=2"
        ])
        if created.returncode != 0:
            # Best effort: opening the stream later reports the real failure.
            print(f"Warning: pactl load-module exited with status {created.returncode}")
        # Give the sound server a moment to register the new sink.
        time.sleep(0.5)

    def initialize_soprano(self):
        """Load Soprano TTS and open the output stream to the virtual sink.

        Raises:
            Exception: Whatever ``sounddevice`` raises when the stream cannot
                be opened or started; it is printed and re-raised.
        """
        print("\n" + "="*70)
        print("Initializing Soprano TTS...")
        print("="*70)
        # Suppress verbose output during initialization
        with open(os.devnull, 'w') as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                self.soprano = SopranoTTS(device="cuda")
        # Open audio stream to virtual sink
        try:
            self.soprano_stream = sd.OutputStream(
                device=self.virtual_sink_name,
                samplerate=self.virtual_sink_sample_rate,
                channels=2,
                dtype='float32',
                blocksize=1024
            )
            self.soprano_stream.start()
            print("✓ Soprano TTS initialized successfully")
            print(f"  Output: {self.virtual_sink_name} ({self.virtual_sink_sample_rate}Hz, stereo)")
        except Exception as e:
            print(f"✗ Failed to open audio stream: {e}")
            raise

    def _headless_rvc_settings(self) -> dict:
        """Translate ``self.rvc_config`` into the dict HeadlessRVCConfig takes."""
        cfg = self.rvc_config
        # Only the two model paths are renamed; every other key is the
        # field name itself.
        settings = {'pth_path': cfg.pth, 'index_path': cfg.index}
        for name in (
            'pitch', 'formant', 'index_rate', 'filter_radius', 'rms_mix_rate',
            'protect', 'f0method', 'input_device', 'output_device',
            'samplerate', 'channels', 'block_time', 'crossfade_time',
            'extra_time', 'n_cpu', 'I_noise_reduce', 'O_noise_reduce',
            'use_pv', 'threshold',
        ):
            settings[name] = getattr(cfg, name)
        return settings

    def start_rvc(self, startup_timeout: float = 60.0):
        """Start headless RVC in a separate daemon thread and wait for it.

        Args:
            startup_timeout: Maximum seconds to wait for the worker to finish
                constructing and starting RVC. On timeout a warning is
                emitted and the pipeline continues (best effort).

        Raises:
            RuntimeError: The worker failed to import, construct or start
                RVC; the original exception is chained as the cause.
        """
        print("\n" + "="*70)
        print("Starting RVC Voice Conversion...")
        print("="*70)

        self._rvc_ready.clear()
        self._rvc_error = None

        def run_rvc():
            # Suppress logging from RVC
            logging.getLogger('faiss').setLevel(logging.ERROR)
            logging.getLogger('fairseq').setLevel(logging.ERROR)
            try:
                # Import here to avoid conflicts
                from headless_rvc import HeadlessRVC, HeadlessRVCConfig
                # Redirect RVC output
                with open(os.devnull, 'w') as devnull:
                    with redirect_stdout(devnull), redirect_stderr(devnull):
                        gui_config = HeadlessRVCConfig(self._headless_rvc_settings())
                        self.rvc = HeadlessRVC(gui_config)
                        self.rvc.start()
            except Exception as exc:
                # Hand the failure to the main thread instead of dying silently.
                self._rvc_error = exc
                return
            finally:
                # Set only after the redirect has been undone: redirect_stdout
                # swaps the process-wide sys.stdout, so the main thread must
                # not print until this point.
                self._rvc_ready.set()
            # Keep running until stopped
            while self.running:
                time.sleep(0.1)
            # Suppress stop output too
            with open(os.devnull, 'w') as devnull:
                with redirect_stdout(devnull), redirect_stderr(devnull):
                    self.rvc.stop()

        self.rvc_thread = threading.Thread(target=run_rvc, daemon=True)
        self.running = True
        self.rvc_thread.start()

        # Wait for RVC to initialize (replaces a fixed 3-second sleep).
        if not self._rvc_ready.wait(timeout=startup_timeout):
            # The worker may still hold the stdout redirect; bypass sys.stdout.
            print(
                f"Warning: RVC still starting after {startup_timeout:.0f}s; continuing anyway",
                file=sys.__stderr__,
            )
            return
        if self._rvc_error is not None:
            self.running = False
            raise RuntimeError(f"RVC failed to start: {self._rvc_error}") from self._rvc_error
        print("✓ RVC initialized successfully")

    def _prepare_chunk(self, audio_chunk):
        """Convert one Soprano chunk into frames the output stream accepts.

        Args:
            audio_chunk: ``None``, a torch tensor, or anything
                ``numpy.asarray`` accepts, at ``soprano_sample_rate``.

        Returns:
            A C-contiguous float32 array at ``virtual_sink_sample_rate``
            (1-D input is duplicated into two columns), or ``None`` when
            there is nothing to play.
        """
        if audio_chunk is None:
            return None
        # Convert torch tensor to numpy if needed
        if torch.is_tensor(audio_chunk):
            audio_chunk = audio_chunk.detach().cpu().numpy()
        audio = np.atleast_1d(np.asarray(audio_chunk, dtype=np.float32))
        if audio.size == 0:
            return None
        # Sanitise BEFORE resampling: the resampler is FFT based, so a single
        # NaN/inf sample would otherwise contaminate the entire chunk.
        audio = np.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
        if self.soprano_sample_rate != self.virtual_sink_sample_rate:
            num_samples_output = int(
                audio.shape[0] * self.virtual_sink_sample_rate / self.soprano_sample_rate
            )
            audio = scipy_signal.resample(audio, num_samples_output)
        # Clip after resampling, which can overshoot the [-1, 1] range.
        audio = np.clip(audio, -1.0, 1.0)
        # Convert mono to stereo
        if audio.ndim == 1:
            audio = np.column_stack((audio, audio))
        # The stream was opened as float32 and needs a contiguous buffer.
        return np.ascontiguousarray(audio, dtype=np.float32)

    def stream_audio_chunk(self, audio_chunk):
        """Stream an audio chunk to the virtual sink.

        ``None`` and empty chunks are ignored. Write failures are reported
        as a warning and do not raise.
        """
        frames = self._prepare_chunk(audio_chunk)
        if frames is None:
            return
        # Write to stream
        try:
            self.soprano_stream.write(frames)
        except Exception as e:
            print(f"Warning: Failed to write audio chunk: {e}")

    def process_text(self, text: str):
        """Process text through TTS and stream to virtual sink.

        Blank text is ignored. Soprano's own console output is suppressed
        while chunks are generated; warnings from writing audio stay visible.
        """
        if not text.strip():
            return
        print(f"\n🎤 Processing: {text}", flush=True)

        end_of_stream = object()
        with open(os.devnull, 'w') as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                chunks = iter(self.soprano.infer_stream(text))
            while True:
                # Silence only the generator step, not our own write path.
                with redirect_stdout(devnull), redirect_stderr(devnull):
                    audio_chunk = next(chunks, end_of_stream)
                if audio_chunk is end_of_stream:
                    break
                if audio_chunk is not None:
                    self.stream_audio_chunk(audio_chunk)

        # Small silence at the end (0.1 s). Sized at Soprano's rate because
        # stream_audio_chunk resamples whatever it is given.
        silence = np.zeros(int(0.1 * self.soprano_sample_rate), dtype=np.float32)
        self.stream_audio_chunk(silence)
        print("✓ Done\n", flush=True)

    def run(self):
        """Run the unified pipeline: set up, then read text until quit/EOF.

        ``cleanup`` is always called on the way out, including after a
        set-up failure or Ctrl+C.
        """
        try:
            # Setup
            self.ensure_virtual_sink()
            self.initialize_soprano()
            self.start_rvc()
            print("\n" + "="*70)
            print("UNIFIED SOPRANO TTS + RVC PIPELINE")
            print("="*70)
            print(f"\n✓ Ready! Voice conversion active (pitch: {self.rvc_config.pitch:+d} semitones)")
            print("\nCommands:")
            print("  - Type text and press Enter to generate speech")
            print("  - Type 'quit' or 'exit' to stop")
            print("  - Press Ctrl+C to stop")
            print("="*70 + "\n")
            # Interactive loop
            while self.running:
                try:
                    text = input("💬 > ").strip()
                except EOFError:
                    break
                except KeyboardInterrupt:
                    break
                if text.lower() in ['quit', 'exit', 'q']:
                    break
                if text:
                    try:
                        self.process_text(text)
                    except KeyboardInterrupt:
                        break
        finally:
            self.cleanup()

    def cleanup(self):
        """Clean up resources. Safe to call more than once."""
        print("\n\n⏹️  Stopping...")
        # Signals the RVC worker loop to exit and stop RVC.
        self.running = False
        stream, self.soprano_stream = self.soprano_stream, None
        if stream:
            try:
                stream.stop()
                stream.close()
            except Exception as e:
                # Keep shutting down; the worker thread still has to be joined.
                print(f"Warning: Failed to close audio stream: {e}")
        if self.rvc_thread and self.rvc_thread.is_alive():
            self.rvc_thread.join(timeout=2)
        print("✓ Stopped")
        print("👋 Goodbye!\n")
def _build_parser():
    """Create the command-line parser for the unified pipeline."""
    parser = argparse.ArgumentParser(
        description="Unified Soprano TTS + RVC Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with command-line arguments
  python unified_soprano_rvc.py --pth model.pth --index model.index --pitch 0
  # Load from config file
  python unified_soprano_rvc.py --config rvc_config.json
  # Save current config
  python unified_soprano_rvc.py --pth model.pth --index model.index --save-config my_config.json
"""
    )
    # Every RVC option defaults to None so that "not given" can be told
    # apart from "given with the default value"; the real defaults live on
    # RVCConfig and are the ones quoted in the help texts.
    # Config file options
    parser.add_argument('--config', type=str, help='Load RVC configuration from JSON file')
    parser.add_argument('--save-config', type=str, help='Save configuration to JSON file and exit')
    # RVC parameters
    parser.add_argument('--pth', type=str, help='Path to RVC model (.pth file)')
    parser.add_argument('--index', type=str, help='Path to index file')
    parser.add_argument('--pitch', type=int, default=None, help='Pitch shift in semitones (default: 0)')
    parser.add_argument('--formant', type=float, default=None, help='Formant shift (default: 0.0)')
    parser.add_argument('--index-rate', type=float, default=None, help='Index rate (default: 0.75)')
    parser.add_argument('--filter-radius', type=int, default=None, help='Filter radius (default: 3)')
    parser.add_argument('--rms-mix-rate', type=float, default=None, help='RMS mix rate (default: 0.25)')
    parser.add_argument('--protect', type=float, default=None, help='Protect voiceless consonants (default: 0.33)')
    parser.add_argument('--f0method', type=str, default=None,
                        choices=['rmvpe', 'harvest', 'crepe', 'fcpe'],
                        help='F0 extraction method (default: rmvpe)')
    # Audio device settings
    parser.add_argument('--input-device', type=str, default=None,
                        help='Input audio device for RVC (default: soprano_rvc)')
    parser.add_argument('--output-device', type=str, help='Output audio device (default: system default)')
    parser.add_argument('--samplerate', type=int, default=None, help='Sample rate (default: 48000)')
    parser.add_argument('--channels', type=int, default=None, help='Number of audio channels (default: 2)')
    parser.add_argument('--block-time', type=float, default=None, help='Block time (default: 0.25)')
    parser.add_argument('--crossfade-time', type=float, default=None, help='Crossfade time (default: 0.04)')
    parser.add_argument('--extra-time', type=float, default=None, help='Extra time (default: 2.5)')
    # Advanced options
    parser.add_argument('--n-cpu', type=int, default=None, help='Number of CPU cores for F0 extraction (default: 4)')
    parser.add_argument('--threshold', type=float, default=None, help='Silence threshold in dB (default: -60)')
    parser.add_argument('--I-noise-reduce', action='store_true', default=None, help='Enable input noise reduction')
    parser.add_argument('--O-noise-reduce', action='store_true', default=None, help='Enable output noise reduction')
    parser.add_argument('--no-use-pv', action='store_true', default=None, help='Disable phase vocoder')
    # Virtual sink name
    parser.add_argument('--virtual-sink', type=str, default='soprano_to_rvc',
                        help='Name of virtual sink (default: soprano_to_rvc)')
    return parser


def _cli_overrides(args):
    """Return {RVCConfig field: value} for options given on the command line."""
    field_names = (
        'pth', 'index', 'pitch', 'formant', 'index_rate', 'filter_radius',
        'rms_mix_rate', 'protect', 'f0method', 'input_device',
        'output_device', 'samplerate', 'channels', 'block_time',
        'crossfade_time', 'extra_time', 'n_cpu', 'I_noise_reduce',
        'O_noise_reduce', 'threshold',
    )
    overrides = {
        name: getattr(args, name)
        for name in field_names
        if getattr(args, name) is not None
    }
    # The flag is phrased negatively, the field positively.
    if args.no_use_pv:
        overrides['use_pv'] = False
    return overrides


def main(argv=None):
    """Command-line entry point.

    Builds an RVCConfig from ``--config`` and/or individual options, then
    either saves it (``--save-config``) or runs the interactive pipeline.
    Options given explicitly on the command line take precedence over
    values loaded from ``--config``.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    overrides = _cli_overrides(args)

    # Load or create config
    if args.config:
        from dataclasses import replace

        print(f"Loading configuration from: {args.config}")
        # Previously every other flag was silently ignored with --config.
        rvc_config = replace(RVCConfig.from_file(args.config), **overrides)
    else:
        # Validate required arguments
        if not args.pth or not args.index:
            parser.error("--pth and --index are required (or use --config)")
        rvc_config = RVCConfig(**overrides)

    # Save config if requested
    if args.save_config:
        rvc_config.to_file(args.save_config)
        print(f"✓ Configuration saved to: {args.save_config}")
        return

    # Run pipeline
    pipeline = UnifiedPipeline(rvc_config, virtual_sink_name=args.virtual_sink)
    pipeline.run()


if __name__ == "__main__":
    main()