unified soprano to rvc script
This commit is contained in:
BIN
__pycache__/headless_rvc.cpython-310.pyc
Normal file
BIN
__pycache__/headless_rvc.cpython-310.pyc
Normal file
Binary file not shown.
@@ -9,8 +9,17 @@ import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
import atexit
|
||||
from pathlib import Path
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(levelname)s: %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Set up environment (same as GUI)
|
||||
os.environ["OMP_NUM_THREADS"] = "4"
|
||||
if sys.platform == "darwin":
|
||||
@@ -41,6 +50,7 @@ from configs.config import Config
|
||||
inp_q = Queue()
|
||||
opt_q = Queue()
|
||||
n_cpu = min(cpu_count(), 8)
|
||||
harvest_processes = [] # Keep track of processes for cleanup
|
||||
|
||||
|
||||
class Harvest(multiprocessing.Process):
|
||||
@@ -71,6 +81,23 @@ for _ in range(n_cpu):
|
||||
p = Harvest(inp_q, opt_q)
|
||||
p.daemon = True
|
||||
p.start()
|
||||
harvest_processes.append(p)
|
||||
|
||||
|
||||
def cleanup_harvest_processes():
    """Shut down every tracked Harvest worker and empty the tracking list.

    Workers that are still alive are sent ``terminate()``; every worker is
    then joined with a short timeout so a stuck process cannot block the
    caller for long. The module-level ``harvest_processes`` list is emptied
    in place, so calling this function again is a no-op.
    """
    # First pass: ask the live workers to stop.
    for worker in harvest_processes:
        if not worker.is_alive():
            continue
        worker.terminate()

    # Second pass: give each worker a brief moment to actually exit.
    for worker in harvest_processes:
        worker.join(timeout=0.1)

    # Mutate in place (no rebinding) so other references see the empty list.
    del harvest_processes[:]
|
||||
|
||||
|
||||
# Register cleanup to run on exit
|
||||
atexit.register(cleanup_harvest_processes)
|
||||
|
||||
|
||||
def phase_vocoder(a, b, fade_out, fade_in):
|
||||
@@ -344,7 +371,6 @@ class HeadlessRVC:
|
||||
self.input_wav[:-self.block_frame] = self.input_wav[self.block_frame:].clone()
|
||||
self.input_wav[-indata.shape[0]:] = torch.from_numpy(indata).to(self.config.device)
|
||||
self.input_wav_res[:-self.block_frame_16k] = self.input_wav_res[self.block_frame_16k:].clone()
|
||||
|
||||
|
||||
# Input noise reduction
|
||||
if self.gui_config.I_noise_reduce:
|
||||
@@ -458,7 +484,7 @@ class HeadlessRVC:
|
||||
)
|
||||
|
||||
total_time = time.perf_counter() - start_time
|
||||
print(f"Infer time: {total_time:.2f}s")
|
||||
logger.debug(f"Infer time: {total_time:.2f}s")
|
||||
|
||||
finally:
|
||||
# Restore directory
|
||||
@@ -500,6 +526,9 @@ class HeadlessRVC:
|
||||
self.stream.abort()
|
||||
self.stream.close()
|
||||
self.stream = None
|
||||
|
||||
# Clean up harvest processes
|
||||
cleanup_harvest_processes()
|
||||
print("✓ Audio stream stopped")
|
||||
|
||||
|
||||
|
||||
23
rvc_config.json
Normal file
23
rvc_config.json
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"pth": "Retrieval-based-Voice-Conversion-WebUI/assets/weights/MikuAI_e210_s6300.pth",
|
||||
"index": "Retrieval-based-Voice-Conversion-WebUI/assets/indices/added_IVF1811_Flat_nprobe_1_MikuAI_v2.index",
|
||||
"pitch": 5,
|
||||
"formant": 0.0,
|
||||
"index_rate": 0.3,
|
||||
"filter_radius": 3,
|
||||
"rms_mix_rate": 0.25,
|
||||
"protect": 0.33,
|
||||
"f0method": "rmvpe",
|
||||
"input_device": "soprano_rvc",
|
||||
"output_device": null,
|
||||
"samplerate": 48000,
|
||||
"channels": 2,
|
||||
"block_time": 0.25,
|
||||
"crossfade_time": 0.04,
|
||||
"extra_time": 2.5,
|
||||
"n_cpu": 4,
|
||||
"I_noise_reduce": false,
|
||||
"O_noise_reduce": false,
|
||||
"use_pv": false,
|
||||
"threshold": -60.0
|
||||
}
|
||||
415
unified_soprano_rvc.py
Normal file
415
unified_soprano_rvc.py
Normal file
@@ -0,0 +1,415 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Unified Soprano TTS + RVC Pipeline
|
||||
Combines soprano_to_virtual_sink.py and headless_rvc.py into a single interface
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
import threading
|
||||
import time
|
||||
import subprocess
|
||||
import warnings
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Optional
|
||||
from contextlib import redirect_stdout, redirect_stderr
|
||||
|
||||
# Configure logging - only show INFO and above
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(levelname)s: %(message)s'
|
||||
)
|
||||
# Suppress debug logs from all modules
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
# Suppress warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
||||
os.environ['MIOPEN_LOG_LEVEL'] = '1' # Suppress MIOpen warnings
|
||||
|
||||
# Add soprano to path
|
||||
sys.path.insert(0, str(Path(__file__).parent / "soprano"))
|
||||
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
from scipy import signal as scipy_signal
|
||||
import torch
|
||||
|
||||
# Import soprano
|
||||
from soprano import SopranoTTS
|
||||
|
||||
|
||||
@dataclass
class RVCConfig:
    """RVC configuration parameters.

    ``pth`` and ``index`` are required; every other field has a default.
    Instances round-trip through JSON via :meth:`to_file` / :meth:`from_file`.
    """
    pth: str                              # path to the RVC model (.pth file)
    index: str                            # path to the index file
    pitch: int = 0                        # pitch shift in semitones
    formant: float = 0.0                  # formant shift
    index_rate: float = 0.75
    filter_radius: int = 3
    rms_mix_rate: float = 0.25
    protect: float = 0.33                 # protection of voiceless consonants
    f0method: str = "rmvpe"               # F0 extraction method
    input_device: str = "soprano_rvc"     # input audio device for RVC
    output_device: Optional[str] = None   # None -> system default output
    samplerate: int = 48000
    channels: int = 2
    block_time: float = 0.25              # presumably seconds - TODO confirm
    crossfade_time: float = 0.04          # presumably seconds - TODO confirm
    extra_time: float = 2.5               # presumably seconds - TODO confirm
    n_cpu: int = 4                        # CPU cores used for F0 extraction
    I_noise_reduce: bool = False          # input noise reduction
    O_noise_reduce: bool = False          # output noise reduction
    use_pv: bool = True                   # phase vocoder on/off
    threshold: float = -60.0              # silence threshold in dB

    @classmethod
    def from_file(cls, config_path: str) -> 'RVCConfig':
        """Load configuration from a JSON file.

        Keys that do not correspond to a field are ignored with a logged
        warning, so a config written by a newer or older version of this
        script still loads instead of failing with an opaque ``TypeError``.

        Raises:
            OSError: the file cannot be opened.
            json.JSONDecodeError: the file is not valid JSON.
            TypeError: the top-level JSON value is not an object, or a
                required field (``pth`` / ``index``) is missing.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, dict):
            raise TypeError(
                f"{config_path}: expected a JSON object at top level, "
                f"got {type(data).__name__}"
            )

        known = cls.__dataclass_fields__
        unknown = sorted(key for key in data if key not in known)
        if unknown:
            logging.getLogger(__name__).warning(
                "Ignoring unknown keys in %s: %s", config_path, ", ".join(unknown)
            )

        return cls(**{key: value for key, value in data.items() if key in known})

    def to_file(self, config_path: str):
        """Save configuration to a JSON file (UTF-8, 2-space indent)."""
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(self), f, indent=2)
|
||||
|
||||
|
||||
class UnifiedPipeline:
    """Unified Soprano TTS + RVC pipeline.

    Text typed by the user is synthesised by Soprano, resampled and written
    to a PulseAudio null sink; a headless RVC instance running in a
    background thread performs the voice conversion.

    NOTE(review): the default sink name here is ``soprano_to_rvc`` while
    ``RVCConfig.input_device`` defaults to ``soprano_rvc`` - confirm that the
    RVC input device really resolves to this sink's monitor.
    """

    # Upper bound (seconds) on how long start_rvc() waits for the RVC thread
    # to report that it finished (or failed) starting up.
    RVC_START_TIMEOUT = 120.0

    def __init__(self, rvc_config: "RVCConfig", virtual_sink_name: str = "soprano_to_rvc"):
        """Store configuration only; no devices or models are touched here."""
        self.rvc_config = rvc_config
        self.virtual_sink_name = virtual_sink_name
        self.soprano = None
        self.rvc = None
        self.rvc_process = None
        self.rvc_thread = None
        self.soprano_stream = None
        self.running = False

        # Soprano audio parameters
        self.soprano_sample_rate = 32000
        self.virtual_sink_sample_rate = 48000

    def ensure_virtual_sink(self):
        """Ensure the PulseAudio virtual sink exists, creating it if needed."""
        print("Checking virtual sink...")

        result = subprocess.run(
            ["pactl", "list", "sinks", "short"],
            capture_output=True,
            text=True
        )

        # Each line is "<index> <name> <module> ..."; compare the name column
        # exactly so that e.g. "soprano_to_rvc_old" does not count as a match.
        existing_sinks = set()
        for line in result.stdout.splitlines():
            fields = line.split()
            if len(fields) > 1:
                existing_sinks.add(fields[1])

        if self.virtual_sink_name not in existing_sinks:
            print(f"Creating virtual sink: {self.virtual_sink_name}")
            created = subprocess.run([
                "pactl", "load-module", "module-null-sink",
                f"sink_name={self.virtual_sink_name}",
                "sink_properties=device.description='Soprano_to_RVC_Virtual_Sink'",
                f"rate={self.virtual_sink_sample_rate}",
                "channels=2"
            ])
            if created.returncode != 0:
                # Do not abort here: opening the stream later reports the
                # definitive error if the sink really is missing.
                print(f"Warning: pactl load-module exited with code {created.returncode}")
            time.sleep(0.5)  # give the sound server a moment to register the sink
        else:
            print(f"✓ Virtual sink '{self.virtual_sink_name}' already exists")

    def initialize_soprano(self):
        """Load Soprano TTS and open the output stream to the virtual sink.

        Raises:
            Exception: whatever ``sd.OutputStream`` raises when the stream
                cannot be opened or started (re-raised after logging).
        """
        print("\n" + "="*70)
        print("Initializing Soprano TTS...")
        print("="*70)

        # Suppress verbose output during initialization
        with open(os.devnull, 'w') as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                self.soprano = SopranoTTS(device="cuda")

        # Open audio stream to virtual sink
        try:
            self.soprano_stream = sd.OutputStream(
                device=self.virtual_sink_name,
                samplerate=self.virtual_sink_sample_rate,
                channels=2,
                dtype='float32',
                blocksize=1024
            )
            self.soprano_stream.start()
            print("✓ Soprano TTS initialized successfully")
            print(f" Output: {self.virtual_sink_name} ({self.virtual_sink_sample_rate}Hz, stereo)")
        except Exception as e:
            print(f"✗ Failed to open audio stream: {e}")
            raise

    def _headless_config_dict(self):
        """Translate ``self.rvc_config`` into the dict HeadlessRVCConfig takes."""
        cfg = self.rvc_config
        return {
            'pth_path': cfg.pth,
            'index_path': cfg.index,
            'pitch': cfg.pitch,
            'formant': cfg.formant,
            'index_rate': cfg.index_rate,
            'filter_radius': cfg.filter_radius,
            'rms_mix_rate': cfg.rms_mix_rate,
            'protect': cfg.protect,
            'f0method': cfg.f0method,
            'input_device': cfg.input_device,
            'output_device': cfg.output_device,
            'samplerate': cfg.samplerate,
            'channels': cfg.channels,
            'block_time': cfg.block_time,
            'crossfade_time': cfg.crossfade_time,
            'extra_time': cfg.extra_time,
            'n_cpu': cfg.n_cpu,
            'I_noise_reduce': cfg.I_noise_reduce,
            'O_noise_reduce': cfg.O_noise_reduce,
            'use_pv': cfg.use_pv,
            'threshold': cfg.threshold
        }

    def start_rvc(self):
        """Start headless RVC in a background thread and wait until it is up.

        The calling thread blocks until the worker has either finished
        starting RVC or failed. This matters for two reasons: a failure is
        reported instead of being lost in the redirected stderr, and the
        calling thread never prints while the worker holds the process-wide
        stdout/stderr redirection.

        Raises:
            RuntimeError: RVC failed to start, or did not report readiness
                within ``RVC_START_TIMEOUT`` seconds.
        """
        print("\n" + "="*70)
        print("Starting RVC Voice Conversion...")
        print("="*70)

        ready = threading.Event()
        failure = []  # holds at most one exception raised during start-up

        def run_rvc():
            # Suppress logging from RVC
            logging.getLogger('faiss').setLevel(logging.ERROR)
            logging.getLogger('fairseq').setLevel(logging.ERROR)

            try:
                # Import here to avoid conflicts
                from headless_rvc import HeadlessRVC, HeadlessRVCConfig

                # Redirect RVC output
                with open(os.devnull, 'w') as devnull:
                    with redirect_stdout(devnull), redirect_stderr(devnull):
                        gui_config = HeadlessRVCConfig(self._headless_config_dict())
                        self.rvc = HeadlessRVC(gui_config)
                        self.rvc.start()
            except BaseException as exc:
                # Thread boundary: hand the error (including a SystemExit
                # raised by the RVC code) to the thread that is waiting.
                failure.append(exc)
                return
            finally:
                # Set only after the redirection above has been undone.
                ready.set()

            # Keep running until stopped
            while self.running:
                time.sleep(0.1)

            # Suppress stop output too
            with open(os.devnull, 'w') as devnull:
                with redirect_stdout(devnull), redirect_stderr(devnull):
                    self.rvc.stop()

        self.rvc_thread = threading.Thread(target=run_rvc, daemon=True)
        self.running = True
        self.rvc_thread.start()

        # Wait for RVC to initialize (model loading can take a while)
        if not ready.wait(timeout=self.RVC_START_TIMEOUT):
            self.running = False
            raise RuntimeError(
                f"RVC did not finish starting within {self.RVC_START_TIMEOUT:.0f}s"
            )
        if failure:
            self.running = False
            raise RuntimeError(f"RVC failed to start: {failure[0]!r}") from failure[0]

        print("✓ RVC initialized successfully")

    def stream_audio_chunk(self, audio_chunk):
        """Stream one audio chunk to the virtual sink.

        ``audio_chunk`` is treated as audio sampled at
        ``self.soprano_sample_rate``; it is resampled to the sink rate,
        sanitised, duplicated to two channels if one-dimensional and written
        to ``self.soprano_stream``. ``None`` and empty chunks are ignored.
        Write errors are reported as a warning and not raised.
        """
        if audio_chunk is None:
            return

        # Convert torch tensor to numpy if needed
        if torch.is_tensor(audio_chunk):
            audio_chunk = audio_chunk.detach().cpu().numpy()

        if len(audio_chunk) == 0:
            return

        # Ensure float32
        audio_chunk = np.asarray(audio_chunk, dtype=np.float32)

        # Resample from the Soprano rate to the sink rate
        if self.soprano_sample_rate != self.virtual_sink_sample_rate:
            num_samples_output = int(len(audio_chunk) * self.virtual_sink_sample_rate / self.soprano_sample_rate)
            audio_chunk = scipy_signal.resample(audio_chunk, num_samples_output)

        # Clean audio (handle NaN/inf)
        audio_chunk = np.nan_to_num(audio_chunk, nan=0.0, posinf=0.0, neginf=0.0)
        audio_chunk = np.clip(audio_chunk, -1.0, 1.0)

        # Convert mono to stereo
        if audio_chunk.ndim == 1:
            audio_chunk = np.column_stack((audio_chunk, audio_chunk))

        # The stream was opened as float32; make sure the array written is
        # float32 and C-contiguous whatever dtype the resampler returned.
        audio_chunk = np.ascontiguousarray(audio_chunk, dtype=np.float32)

        # Write to stream
        try:
            self.soprano_stream.write(audio_chunk)
        except Exception as e:
            print(f"Warning: Failed to write audio chunk: {e}")

    def process_text(self, text: str):
        """Synthesise ``text`` with Soprano and stream it to the virtual sink.

        Blank text is ignored. About 0.1 s of silence is appended after the
        speech.
        """
        if not text.strip():
            return

        print(f"\n🎤 Processing: {text}", flush=True)

        # Only Soprano's own work runs with stdout/stderr silenced; chunks are
        # written outside the redirection so write warnings stay visible.
        end_of_stream = object()
        with open(os.devnull, 'w') as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                chunks = iter(self.soprano.infer_stream(text))
            while True:
                with redirect_stdout(devnull), redirect_stderr(devnull):
                    audio_chunk = next(chunks, end_of_stream)
                if audio_chunk is end_of_stream:
                    break
                self.stream_audio_chunk(audio_chunk)

        # Small silence at the end. stream_audio_chunk() resamples its input
        # from the Soprano rate, so the buffer is sized at that rate to get
        # 0.1 s at the sink.
        silence = np.zeros(int(0.1 * self.soprano_sample_rate), dtype=np.float32)
        self.stream_audio_chunk(silence)
        print("✓ Done\n", flush=True)

    def run(self):
        """Set everything up, then run the interactive prompt until quit.

        ``cleanup()`` is always called on the way out, including when set-up
        fails or the user presses Ctrl+C.
        """
        try:
            # Setup
            self.ensure_virtual_sink()
            self.initialize_soprano()
            self.start_rvc()

            print("\n" + "="*70)
            print("UNIFIED SOPRANO TTS + RVC PIPELINE")
            print("="*70)
            print(f"\n✓ Ready! Voice conversion active (pitch: {self.rvc_config.pitch:+d} semitones)")
            print("\nCommands:")
            print(" - Type text and press Enter to generate speech")
            print(" - Type 'quit' or 'exit' to stop")
            print(" - Press Ctrl+C to stop")
            print("="*70 + "\n")

            # Interactive loop
            while self.running:
                try:
                    text = input("💬 > ").strip()

                    if text.lower() in ['quit', 'exit', 'q']:
                        break

                    if text:
                        self.process_text(text)

                except EOFError:
                    break
                except KeyboardInterrupt:
                    break

        finally:
            self.cleanup()

    def cleanup(self):
        """Stop the output stream and the RVC thread; safe to call twice."""
        print("\n\n⏹️ Stopping...")
        self.running = False  # tells the RVC thread to stop RVC and exit

        # Detach the stream first so a second call does not close it again.
        stream, self.soprano_stream = self.soprano_stream, None
        if stream is not None:
            try:
                stream.stop()
                stream.close()
            except Exception as e:
                print(f"Warning: Failed to close audio stream: {e}")

        if self.rvc_thread and self.rvc_thread.is_alive():
            self.rvc_thread.join(timeout=2)

        print("✓ Stopped")
        print("👋 Goodbye!\n")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Builds an ``RVCConfig`` from ``--config`` and/or individual options,
    optionally writes it out with ``--save-config`` (and exits), otherwise
    runs the interactive ``UnifiedPipeline``.

    Options given explicitly on the command line take precedence over the
    values loaded from ``--config``. Options that are not given fall back to
    the config file or, without one, to the ``RVCConfig`` field defaults
    (the values quoted in the help texts).
    """
    parser = argparse.ArgumentParser(
        description="Unified Soprano TTS + RVC Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with command-line arguments
  python unified_soprano_rvc.py --pth model.pth --index model.index --pitch 0

  # Load from config file
  python unified_soprano_rvc.py --config rvc_config.json

  # Save current config
  python unified_soprano_rvc.py --pth model.pth --index model.index --save-config my_config.json
"""
    )

    # Config file options
    parser.add_argument('--config', type=str, help='Load RVC configuration from JSON file')
    parser.add_argument('--save-config', type=str, help='Save configuration to JSON file and exit')

    # RVC parameters. Defaults are None so "not given" can be told apart from
    # "given"; the effective defaults come from RVCConfig.
    parser.add_argument('--pth', type=str, help='Path to RVC model (.pth file)')
    parser.add_argument('--index', type=str, help='Path to index file')
    parser.add_argument('--pitch', type=int, default=None, help='Pitch shift in semitones (default: 0)')
    parser.add_argument('--formant', type=float, default=None, help='Formant shift (default: 0.0)')
    parser.add_argument('--index-rate', type=float, default=None, help='Index rate (default: 0.75)')
    parser.add_argument('--filter-radius', type=int, default=None, help='Filter radius (default: 3)')
    parser.add_argument('--rms-mix-rate', type=float, default=None, help='RMS mix rate (default: 0.25)')
    parser.add_argument('--protect', type=float, default=None, help='Protect voiceless consonants (default: 0.33)')
    parser.add_argument('--f0method', type=str, default=None,
                        choices=['rmvpe', 'harvest', 'crepe', 'fcpe'],
                        help='F0 extraction method (default: rmvpe)')

    # Audio device settings
    parser.add_argument('--input-device', type=str, default=None,
                        help='Input audio device for RVC (default: soprano_rvc)')
    parser.add_argument('--output-device', type=str, help='Output audio device (default: system default)')
    parser.add_argument('--samplerate', type=int, default=None, help='Sample rate (default: 48000)')

    # Advanced options
    parser.add_argument('--n-cpu', type=int, default=None, help='Number of CPU cores for F0 extraction (default: 4)')
    parser.add_argument('--threshold', type=float, default=None, help='Silence threshold in dB (default: -60)')
    parser.add_argument('--I-noise-reduce', action='store_true', help='Enable input noise reduction')
    parser.add_argument('--O-noise-reduce', action='store_true', help='Enable output noise reduction')
    parser.add_argument('--no-use-pv', action='store_true', help='Disable phase vocoder')

    # Virtual sink name
    parser.add_argument('--virtual-sink', type=str, default='soprano_to_rvc',
                        help='Name of virtual sink (default: soprano_to_rvc)')

    args = parser.parse_args()

    # Collect only what the user actually passed on the command line.
    value_options = (
        'pth', 'index', 'pitch', 'formant', 'index_rate', 'filter_radius',
        'rms_mix_rate', 'protect', 'f0method', 'input_device',
        'output_device', 'samplerate', 'n_cpu', 'threshold',
    )
    overrides = {
        name: getattr(args, name)
        for name in value_options
        if getattr(args, name) is not None
    }
    # store_true flags: only a set flag counts as an explicit choice.
    if args.I_noise_reduce:
        overrides['I_noise_reduce'] = True
    if args.O_noise_reduce:
        overrides['O_noise_reduce'] = True
    if args.no_use_pv:
        overrides['use_pv'] = False

    # Load or create config
    if args.config:
        print(f"Loading configuration from: {args.config}")
        try:
            rvc_config = RVCConfig.from_file(args.config)
        except (OSError, ValueError, TypeError) as exc:
            # ValueError covers json.JSONDecodeError; TypeError covers a file
            # whose keys do not match the RVCConfig fields.
            parser.error(f"could not load config '{args.config}': {exc}")
        # Explicit command-line options win over the file.
        for name, value in overrides.items():
            setattr(rvc_config, name, value)
    else:
        # Validate required arguments
        if not args.pth or not args.index:
            parser.error("--pth and --index are required (or use --config)")

        rvc_config = RVCConfig(**overrides)

    # Save config if requested
    if args.save_config:
        rvc_config.to_file(args.save_config)
        print(f"✓ Configuration saved to: {args.save_config}")
        return

    # Run pipeline
    pipeline = UnifiedPipeline(rvc_config, virtual_sink_name=args.virtual_sink)
    pipeline.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user