def cleanup_harvest_processes():
    """Stop every tracked Harvest worker process and empty the registry.

    The function is invoked explicitly when the audio stream is stopped and
    again through the ``atexit`` hook, so it must be safe to call repeatedly:
    the registry is emptied on the first call, which turns later calls into
    no-ops.

    Each live worker is first asked to exit with ``terminate()``.  A worker
    that is still alive after a short grace period is force-killed, so no
    running process is left behind once its handle has been dropped.

    NOTE(review): workers are not restarted anywhere in this function; confirm
    that nothing needs the worker pool again after a stop.
    """
    # Work on a snapshot and empty the shared list in place (no rebinding, so
    # no ``global`` statement is needed and other references see the change).
    workers = list(harvest_processes)
    harvest_processes.clear()

    for proc in workers:
        try:
            if proc.is_alive():
                proc.terminate()
        except (OSError, ValueError):
            # Handle already closed or process already reaped: nothing to do,
            # and one bad handle must not prevent cleaning up the others.
            continue

    for proc in workers:
        try:
            # Wait briefly for a graceful exit.
            proc.join(timeout=0.1)
            if proc.is_alive():
                # terminate() was ignored or too slow; escalate.
                proc.kill()
                proc.join(timeout=0.1)
        except (OSError, ValueError):
            continue
{total_time:.2f}s") finally: # Restore directory @@ -500,6 +526,9 @@ class HeadlessRVC: self.stream.abort() self.stream.close() self.stream = None + + # Clean up harvest processes + cleanup_harvest_processes() print("✓ Audio stream stopped") diff --git a/rvc_config.json b/rvc_config.json new file mode 100644 index 0000000..460cc69 --- /dev/null +++ b/rvc_config.json @@ -0,0 +1,23 @@ +{ + "pth": "Retrieval-based-Voice-Conversion-WebUI/assets/weights/MikuAI_e210_s6300.pth", + "index": "Retrieval-based-Voice-Conversion-WebUI/assets/indices/added_IVF1811_Flat_nprobe_1_MikuAI_v2.index", + "pitch": 5, + "formant": 0.0, + "index_rate": 0.3, + "filter_radius": 3, + "rms_mix_rate": 0.25, + "protect": 0.33, + "f0method": "rmvpe", + "input_device": "soprano_rvc", + "output_device": null, + "samplerate": 48000, + "channels": 2, + "block_time": 0.25, + "crossfade_time": 0.04, + "extra_time": 2.5, + "n_cpu": 4, + "I_noise_reduce": false, + "O_noise_reduce": false, + "use_pv": false, + "threshold": -60.0 +} diff --git a/unified_soprano_rvc.py b/unified_soprano_rvc.py new file mode 100644 index 0000000..f3a970b --- /dev/null +++ b/unified_soprano_rvc.py @@ -0,0 +1,415 @@ +#!/usr/bin/env python3 +""" +Unified Soprano TTS + RVC Pipeline +Combines soprano_to_virtual_sink.py and headless_rvc.py into a single interface +""" + +import sys +import os +import json +import argparse +import threading +import time +import subprocess +import warnings +import logging +from pathlib import Path +from dataclasses import dataclass, asdict +from typing import Optional +from contextlib import redirect_stdout, redirect_stderr + +# Configure logging - only show INFO and above +logging.basicConfig( + level=logging.INFO, + format='%(levelname)s: %(message)s' +) +# Suppress debug logs from all modules +logging.getLogger().setLevel(logging.INFO) + +# Suppress warnings +warnings.filterwarnings('ignore') +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['MIOPEN_LOG_LEVEL'] = '1' # Suppress MIOpen 
warnings + +# Add soprano to path +sys.path.insert(0, str(Path(__file__).parent / "soprano")) + +import numpy as np +import sounddevice as sd +from scipy import signal as scipy_signal +import torch + +# Import soprano +from soprano import SopranoTTS + + +@dataclass +class RVCConfig: + """RVC configuration parameters""" + pth: str + index: str + pitch: int = 0 + formant: float = 0.0 + index_rate: float = 0.75 + filter_radius: int = 3 + rms_mix_rate: float = 0.25 + protect: float = 0.33 + f0method: str = "rmvpe" + input_device: str = "soprano_rvc" + output_device: Optional[str] = None + samplerate: int = 48000 + channels: int = 2 + block_time: float = 0.25 + crossfade_time: float = 0.04 + extra_time: float = 2.5 + n_cpu: int = 4 + I_noise_reduce: bool = False + O_noise_reduce: bool = False + use_pv: bool = True + threshold: float = -60.0 + + @classmethod + def from_file(cls, config_path: str) -> 'RVCConfig': + """Load configuration from JSON file""" + with open(config_path, 'r') as f: + data = json.load(f) + return cls(**data) + + def to_file(self, config_path: str): + """Save configuration to JSON file""" + with open(config_path, 'w') as f: + json.dump(asdict(self), f, indent=2) + + +class UnifiedPipeline: + """Unified Soprano TTS + RVC pipeline""" + + def __init__(self, rvc_config: RVCConfig, virtual_sink_name: str = "soprano_to_rvc"): + self.rvc_config = rvc_config + self.virtual_sink_name = virtual_sink_name + self.soprano = None + self.rvc_process = None + self.rvc_thread = None + self.soprano_stream = None + self.running = False + + # Soprano audio parameters + self.soprano_sample_rate = 32000 + self.virtual_sink_sample_rate = 48000 + + def ensure_virtual_sink(self): + """Ensure PulseAudio virtual sink exists""" + print("Checking virtual sink...") + + # Check if sink exists + result = subprocess.run( + ["pactl", "list", "sinks", "short"], + capture_output=True, + text=True + ) + + if self.virtual_sink_name not in result.stdout: + print(f"Creating virtual sink: 
def initialize_soprano(self):
    """Create the Soprano TTS engine and open its output stream.

    The engine is constructed with stdout/stderr redirected to the null
    device so its start-up chatter is hidden.  Afterwards a float32, stereo
    output stream targeting the virtual sink is opened and started.  A
    failure while opening or starting the stream is reported on stdout and
    then re-raised to the caller.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("Initializing Soprano TTS...")
    print(rule)

    # Silence the engine's own console output while it loads.
    with open(os.devnull, 'w') as null_out, \
            redirect_stdout(null_out), redirect_stderr(null_out):
        self.soprano = SopranoTTS(device="cuda")

    # Connect the engine's output to the virtual sink.
    try:
        stream_settings = {
            'device': self.virtual_sink_name,
            'samplerate': self.virtual_sink_sample_rate,
            'channels': 2,
            'dtype': 'float32',
            'blocksize': 1024,
        }
        self.soprano_stream = sd.OutputStream(**stream_settings)
        self.soprano_stream.start()
    except Exception as err:
        print(f"✗ Failed to open audio stream: {err}")
        raise
    else:
        print("✓ Soprano TTS initialized successfully")
        print(f" Output: {self.virtual_sink_name} ({self.virtual_sink_sample_rate}Hz, stereo)")
def stream_audio_chunk(self, audio_chunk):
    """Convert one audio chunk to the sink format and write it to the stream.

    Args:
        audio_chunk: ``None``, a torch tensor, a numpy array or any array-like
            of samples.  ``None`` and empty chunks are ignored.

    The chunk is converted to float32, cleaned of NaN/Inf values, resampled
    from ``self.soprano_sample_rate`` to ``self.virtual_sink_sample_rate``
    when the two differ, clipped to [-1.0, 1.0], duplicated to two channels if
    it is one-dimensional, and finally written to ``self.soprano_stream``.
    A failing write is reported on stdout and otherwise ignored, so a single
    bad chunk does not abort synthesis.
    """
    if audio_chunk is None:
        return

    if torch.is_tensor(audio_chunk):
        # detach() allows tensors that still track gradients; float() covers
        # half/bfloat16 tensors, which numpy cannot always represent.
        audio_chunk = audio_chunk.detach().cpu().float().numpy()

    samples = np.atleast_1d(np.asarray(audio_chunk, dtype=np.float32))
    if samples.size == 0:
        return

    # Remove NaN/Inf *before* resampling: the resampler works in the frequency
    # domain, so one non-finite sample would otherwise contaminate the whole
    # chunk and turn all of it into silence.
    samples = np.nan_to_num(samples, nan=0.0, posinf=0.0, neginf=0.0)

    if self.soprano_sample_rate != self.virtual_sink_sample_rate:
        target_len = int(
            len(samples) * self.virtual_sink_sample_rate / self.soprano_sample_rate
        )
        if target_len == 0:
            # Chunk too short to yield even one output sample.
            return
        samples = scipy_signal.resample(samples, target_len)

    # Resampling can overshoot the original range, so clean and clip afterwards.
    samples = np.nan_to_num(samples, nan=0.0, posinf=0.0, neginf=0.0)
    samples = np.clip(samples, -1.0, 1.0)

    # Mono -> stereo: the same signal on both channels.
    if samples.ndim == 1:
        samples = np.column_stack((samples, samples))

    # Guarantee the dtype the stream is opened with, in C-contiguous layout,
    # whatever dtype the resampler returned.
    samples = np.ascontiguousarray(samples, dtype=np.float32)

    try:
        self.soprano_stream.write(samples)
    except Exception as e:
        # Best effort: report the failure and keep the pipeline running.
        print(f"Warning: Failed to write audio chunk: {e}")
def cleanup(self):
    """Release the audio stream and wait briefly for the RVC thread.

    Clears ``self.running`` (the flag the RVC worker loop polls), stops and
    closes the Soprano output stream, then joins the RVC thread for at most
    two seconds.

    The method runs from a ``finally`` clause, so it must not raise: a failure
    while stopping or closing the stream is reported and the remaining steps
    still run.  The stream reference is cleared, so calling ``cleanup()`` a
    second time does not touch an already closed stream.
    """
    print("\n\n⏹️ Stopping...")
    self.running = False

    # Take ownership of the stream and clear the attribute first, so that a
    # repeated cleanup() never touches a closed stream.
    stream, self.soprano_stream = self.soprano_stream, None
    if stream is not None:
        # Deliberately broad: this is a shutdown boundary and any error from
        # the audio backend must not prevent the remaining cleanup.
        try:
            stream.stop()
        except Exception as exc:
            print(f"Warning: Failed to stop audio stream: {exc}")
        try:
            stream.close()
        except Exception as exc:
            print(f"Warning: Failed to close audio stream: {exc}")

    worker = self.rvc_thread
    if worker is not None and worker.is_alive():
        # Bounded wait so shutdown cannot hang on a stuck worker.
        worker.join(timeout=2)

    print("✓ Stopped")
    print("👋 Goodbye!\n")
type=float, default=0.75, help='Index rate (default: 0.75)') + parser.add_argument('--filter-radius', type=int, default=3, help='Filter radius (default: 3)') + parser.add_argument('--rms-mix-rate', type=float, default=0.25, help='RMS mix rate (default: 0.25)') + parser.add_argument('--protect', type=float, default=0.33, help='Protect voiceless consonants (default: 0.33)') + parser.add_argument('--f0method', type=str, default='rmvpe', + choices=['rmvpe', 'harvest', 'crepe', 'fcpe'], + help='F0 extraction method (default: rmvpe)') + + # Audio device settings + parser.add_argument('--input-device', type=str, default='soprano_rvc', + help='Input audio device for RVC (default: soprano_rvc)') + parser.add_argument('--output-device', type=str, help='Output audio device (default: system default)') + parser.add_argument('--samplerate', type=int, default=48000, help='Sample rate (default: 48000)') + + # Advanced options + parser.add_argument('--n-cpu', type=int, default=4, help='Number of CPU cores for F0 extraction (default: 4)') + parser.add_argument('--threshold', type=float, default=-60.0, help='Silence threshold in dB (default: -60)') + parser.add_argument('--I-noise-reduce', action='store_true', help='Enable input noise reduction') + parser.add_argument('--O-noise-reduce', action='store_true', help='Enable output noise reduction') + parser.add_argument('--no-use-pv', action='store_true', help='Disable phase vocoder') + + # Virtual sink name + parser.add_argument('--virtual-sink', type=str, default='soprano_to_rvc', + help='Name of virtual sink (default: soprano_to_rvc)') + + args = parser.parse_args() + + # Load or create config + if args.config: + print(f"Loading configuration from: {args.config}") + rvc_config = RVCConfig.from_file(args.config) + else: + # Validate required arguments + if not args.pth or not args.index: + parser.error("--pth and --index are required (or use --config)") + + rvc_config = RVCConfig( + pth=args.pth, + index=args.index, + 
pitch=args.pitch, + formant=args.formant, + index_rate=args.index_rate, + filter_radius=args.filter_radius, + rms_mix_rate=args.rms_mix_rate, + protect=args.protect, + f0method=args.f0method, + input_device=args.input_device, + output_device=args.output_device, + samplerate=args.samplerate, + n_cpu=args.n_cpu, + I_noise_reduce=args.I_noise_reduce, + O_noise_reduce=args.O_noise_reduce, + use_pv=not args.no_use_pv, + threshold=args.threshold + ) + + # Save config if requested + if args.save_config: + rvc_config.to_file(args.save_config) + print(f"✓ Configuration saved to: {args.save_config}") + return + + # Run pipeline + pipeline = UnifiedPipeline(rvc_config, virtual_sink_name=args.virtual_sink) + pipeline.run() + + +if __name__ == "__main__": + main()