From 942ca362526ba8ea5df090b5dc97931b6c4e0196 Mon Sep 17 00:00:00 2001 From: Koko210 Date: Mon, 12 Jan 2026 22:55:21 +0200 Subject: [PATCH] Working with GUI, auto loopback creation, soprano streaming --- .gitignore | 2 + Retrieval-based-Voice-Conversion-WebUI | 1 + cleanup_virtual_sinks.sh | 29 + constraints.txt | 24 + gui_v1.py.backup | 1070 ++++++++++++++++++++++++ launch_soprano_rvc.sh | 260 ++++++ python-version.txt | 1 + requirements.lock.txt | 159 ++++ requirements.txt | 86 ++ setup_alsa_bridge.sh | 52 ++ soprano | 1 + soprano_to_virtual_sink.py | 299 +++++++ 12 files changed, 1984 insertions(+) create mode 100644 .gitignore create mode 160000 Retrieval-based-Voice-Conversion-WebUI create mode 100755 cleanup_virtual_sinks.sh create mode 100644 constraints.txt create mode 100644 gui_v1.py.backup create mode 100755 launch_soprano_rvc.sh create mode 100644 python-version.txt create mode 100644 requirements.lock.txt create mode 100644 requirements.txt create mode 100755 setup_alsa_bridge.sh create mode 160000 soprano create mode 100755 soprano_to_virtual_sink.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..66f6169 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.venv +*.pth diff --git a/Retrieval-based-Voice-Conversion-WebUI b/Retrieval-based-Voice-Conversion-WebUI new file mode 160000 index 0000000..7ef1986 --- /dev/null +++ b/Retrieval-based-Voice-Conversion-WebUI @@ -0,0 +1 @@ +Subproject commit 7ef19867780cf703841ebafb565a4e47d1ea86ff diff --git a/cleanup_virtual_sinks.sh b/cleanup_virtual_sinks.sh new file mode 100755 index 0000000..b71725b --- /dev/null +++ b/cleanup_virtual_sinks.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Cleanup script to remove any leftover virtual sinks +# Run this if you encounter issues with the virtual sink + +echo "🧹 Cleaning up virtual audio sinks..." 
+echo "" + +# Find and remove soprano virtual sinks +MODULES=$(pactl list modules short | grep -E "soprano|rvc" | awk '{print $1}') + +if [ -z "$MODULES" ]; then + echo "✓ No virtual sinks found. Nothing to clean up." +else + echo "Found virtual sink modules to remove:" + pactl list modules short | grep -E "soprano|rvc" + echo "" + + for MODULE in $MODULES; do + echo "Removing module $MODULE..." + pactl unload-module "$MODULE" + done + + echo "" + echo "✓ Cleanup complete!" +fi + +echo "" +echo "Current audio sinks:" +pactl list sinks short diff --git a/constraints.txt b/constraints.txt new file mode 100644 index 0000000..2b46ce2 --- /dev/null +++ b/constraints.txt @@ -0,0 +1,24 @@ +# ========================================================== +# HARD CONSTRAINTS — DO NOT UPGRADE CASUALLY +# ========================================================== + +python_version == "3.10.19" + +# Torch / ROCm ABI lock +torch == 2.5.1+rocm6.2 +torchaudio == 2.5.1+rocm6.2 +torchvision == 0.20.1+rocm6.2 +pytorch-triton-rocm == 3.1.0 + +# NumPy / Numba compatibility +numpy < 1.24 +numba == 0.56.4 +llvmlite == 0.39.0 + +# RVC core +fairseq == 0.12.2 +faiss-cpu == 1.7.3 +pyworld < 0.4 + +# Gradio pin (RVC WebUI tested) +gradio == 3.48.0 diff --git a/gui_v1.py.backup b/gui_v1.py.backup new file mode 100644 index 0000000..c5e7179 --- /dev/null +++ b/gui_v1.py.backup @@ -0,0 +1,1070 @@ +import os +import sys +from dotenv import load_dotenv +import shutil + +load_dotenv() + +os.environ["OMP_NUM_THREADS"] = "4" +if sys.platform == "darwin": + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +now_dir = os.getcwd() +sys.path.append(now_dir) +import multiprocessing + +flag_vc = False + + +def printt(strr, *args): + if len(args) == 0: + print(strr) + else: + print(strr % args) + + +def phase_vocoder(a, b, fade_out, fade_in): + window = torch.sqrt(fade_out * fade_in) + fa = torch.fft.rfft(a * window) + fb = torch.fft.rfft(b * window) + absab = torch.abs(fa) + torch.abs(fb) + n = a.shape[0] + 
if n % 2 == 0: + absab[1:-1] *= 2 + else: + absab[1:] *= 2 + phia = torch.angle(fa) + phib = torch.angle(fb) + deltaphase = phib - phia + deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5) + w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase + t = torch.arange(n).unsqueeze(-1).to(a) / n + result = ( + a * (fade_out**2) + + b * (fade_in**2) + + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n + ) + return result + + +class Harvest(multiprocessing.Process): + def __init__(self, inp_q, opt_q): + multiprocessing.Process.__init__(self) + self.inp_q = inp_q + self.opt_q = opt_q + + def run(self): + import numpy as np + import pyworld + + while 1: + idx, x, res_f0, n_cpu, ts = self.inp_q.get() + f0, t = pyworld.harvest( + x.astype(np.double), + fs=16000, + f0_ceil=1100, + f0_floor=50, + frame_period=10, + ) + res_f0[idx] = f0 + if len(res_f0.keys()) >= n_cpu: + self.opt_q.put(ts) + + +if __name__ == "__main__": + import json + import multiprocessing + import re + import threading + import time + import traceback + from multiprocessing import Queue, cpu_count + from queue import Empty + + import librosa + from tools.torchgate import TorchGate + import numpy as np + import FreeSimpleGUI as sg + import sounddevice as sd + import torch + import torch.nn.functional as F + import torchaudio.transforms as tat + + from infer.lib import rtrvc as rvc_for_realtime + from i18n.i18n import I18nAuto + from configs.config import Config + + i18n = I18nAuto() + + # device = rvc_for_realtime.config.device + # device = torch.device( + # "cuda" + # if torch.cuda.is_available() + # else ("mps" if torch.backends.mps.is_available() else "cpu") + # ) + current_dir = os.getcwd() + inp_q = Queue() + opt_q = Queue() + n_cpu = min(cpu_count(), 8) + for _ in range(n_cpu): + p = Harvest(inp_q, opt_q) + p.daemon = True + p.start() + + class GUIConfig: + def __init__(self) -> None: + self.pth_path: str = "" + self.index_path: str = "" + self.pitch: int = 
0 + self.formant=0.0 + self.sr_type: str = "sr_model" + self.block_time: float = 0.25 # s + self.threhold: int = -60 + self.crossfade_time: float = 0.05 + self.extra_time: float = 2.5 + self.I_noise_reduce: bool = False + self.O_noise_reduce: bool = False + self.use_pv: bool = False + self.rms_mix_rate: float = 0.0 + self.index_rate: float = 0.0 + self.n_cpu: int = min(n_cpu, 4) + self.f0method: str = "fcpe" + self.sg_hostapi: str = "" + self.wasapi_exclusive: bool = False + self.sg_input_device: str = "" + self.sg_output_device: str = "" + + class GUI: + def __init__(self) -> None: + self.gui_config = GUIConfig() + self.config = Config() + self.function = "vc" + self.delay_time = 0 + self.hostapis = None + self.input_devices = None + self.output_devices = None + self.input_devices_indices = None + self.output_devices_indices = None + self.stream = None + self.update_devices() + self.launcher() + + def load(self): + try: + if not os.path.exists("configs/inuse/config.json"): + shutil.copy("configs/config.json", "configs/inuse/config.json") + with open("configs/inuse/config.json", "r") as j: + data = json.load(j) + data["sr_model"] = data["sr_type"] == "sr_model" + data["sr_device"] = data["sr_type"] == "sr_device" + data["pm"] = data["f0method"] == "pm" + data["harvest"] = data["f0method"] == "harvest" + data["crepe"] = data["f0method"] == "crepe" + data["rmvpe"] = data["f0method"] == "rmvpe" + data["fcpe"] = data["f0method"] == "fcpe" + if data["sg_hostapi"] in self.hostapis: + self.update_devices(hostapi_name=data["sg_hostapi"]) + if ( + data["sg_input_device"] not in self.input_devices + or data["sg_output_device"] not in self.output_devices + ): + self.update_devices() + data["sg_hostapi"] = self.hostapis[0] + data["sg_input_device"] = self.input_devices[ + self.input_devices_indices.index(sd.default.device[0]) + ] + data["sg_output_device"] = self.output_devices[ + self.output_devices_indices.index(sd.default.device[1]) + ] + else: + data["sg_hostapi"] = 
self.hostapis[0] + data["sg_input_device"] = self.input_devices[ + self.input_devices_indices.index(sd.default.device[0]) + ] + data["sg_output_device"] = self.output_devices[ + self.output_devices_indices.index(sd.default.device[1]) + ] + except: + with open("configs/inuse/config.json", "w") as j: + data = { + "pth_path": "", + "index_path": "", + "sg_hostapi": self.hostapis[0], + "sg_wasapi_exclusive": False, + "sg_input_device": self.input_devices[ + self.input_devices_indices.index(sd.default.device[0]) + ], + "sg_output_device": self.output_devices[ + self.output_devices_indices.index(sd.default.device[1]) + ], + "sr_type": "sr_model", + "threhold": -60, + "pitch": 0, + "formant": 0.0, + "index_rate": 0, + "rms_mix_rate": 0, + "block_time": 0.25, + "crossfade_length": 0.05, + "extra_time": 2.5, + "n_cpu": 4, + "f0method": "rmvpe", + "use_jit": False, + "use_pv": False, + } + data["sr_model"] = data["sr_type"] == "sr_model" + data["sr_device"] = data["sr_type"] == "sr_device" + data["pm"] = data["f0method"] == "pm" + data["harvest"] = data["f0method"] == "harvest" + data["crepe"] = data["f0method"] == "crepe" + data["rmvpe"] = data["f0method"] == "rmvpe" + data["fcpe"] = data["f0method"] == "fcpe" + return data + + def launcher(self): + data = self.load() + self.config.use_jit = False # data.get("use_jit", self.config.use_jit) + sg.theme("LightBlue3") + layout = [ + [ + sg.Frame( + title=i18n("加载模型"), + layout=[ + [ + sg.Input( + default_text=data.get("pth_path", ""), + key="pth_path", + ), + sg.FileBrowse( + i18n("选择.pth文件"), + initial_folder=os.path.join( + os.getcwd(), "assets/weights" + ), + file_types=((". pth"),), + ), + ], + [ + sg.Input( + default_text=data.get("index_path", ""), + key="index_path", + ), + sg.FileBrowse( + i18n("选择.index文件"), + initial_folder=os.path.join(os.getcwd(), "logs"), + file_types=((". 
index"),), + ), + ], + ], + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("设备类型")), + sg.Combo( + self.hostapis, + key="sg_hostapi", + default_value=data.get("sg_hostapi", ""), + enable_events=True, + size=(20, 1), + ), + sg.Checkbox( + i18n("独占 WASAPI 设备"), + key="sg_wasapi_exclusive", + default=data.get("sg_wasapi_exclusive", False), + enable_events=True, + ), + ], + [ + sg.Text(i18n("输入设备")), + sg.Combo( + self.input_devices, + key="sg_input_device", + default_value=data.get("sg_input_device", ""), + enable_events=True, + size=(45, 1), + ), + ], + [ + sg.Text(i18n("输出设备")), + sg.Combo( + self.output_devices, + key="sg_output_device", + default_value=data.get("sg_output_device", ""), + enable_events=True, + size=(45, 1), + ), + ], + [ + sg.Button(i18n("重载设备列表"), key="reload_devices"), + sg.Radio( + i18n("使用模型采样率"), + "sr_type", + key="sr_model", + default=data.get("sr_model", True), + enable_events=True, + ), + sg.Radio( + i18n("使用设备采样率"), + "sr_type", + key="sr_device", + default=data.get("sr_device", False), + enable_events=True, + ), + sg.Text(i18n("采样率:")), + sg.Text("", key="sr_stream"), + ], + ], + title=i18n("音频设备"), + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("响应阈值")), + sg.Slider( + range=(-60, 0), + key="threhold", + resolution=1, + orientation="h", + default_value=data.get("threhold", -60), + enable_events=True, + ), + ], + [ + sg.Text(i18n("音调设置")), + sg.Slider( + range=(-16, 16), + key="pitch", + resolution=1, + orientation="h", + default_value=data.get("pitch", 0), + enable_events=True, + ), + ], + [ + sg.Text(i18n("性别因子/声线粗细")), + sg.Slider( + range=(-2, 2), + key="formant", + resolution=0.05, + orientation="h", + default_value=data.get("formant", 0.0), + enable_events=True, + ), + ], + [ + sg.Text(i18n("Index Rate")), + sg.Slider( + range=(0.0, 1.0), + key="index_rate", + resolution=0.01, + orientation="h", + default_value=data.get("index_rate", 0), + enable_events=True, + ), + ], + [ + sg.Text(i18n("响度因子")), + sg.Slider( + 
range=(0.0, 1.0), + key="rms_mix_rate", + resolution=0.01, + orientation="h", + default_value=data.get("rms_mix_rate", 0), + enable_events=True, + ), + ], + [ + sg.Text(i18n("音高算法")), + sg.Radio( + "pm", + "f0method", + key="pm", + default=data.get("pm", False), + enable_events=True, + ), + sg.Radio( + "harvest", + "f0method", + key="harvest", + default=data.get("harvest", False), + enable_events=True, + ), + sg.Radio( + "crepe", + "f0method", + key="crepe", + default=data.get("crepe", False), + enable_events=True, + ), + sg.Radio( + "rmvpe", + "f0method", + key="rmvpe", + default=data.get("rmvpe", False), + enable_events=True, + ), + sg.Radio( + "fcpe", + "f0method", + key="fcpe", + default=data.get("fcpe", True), + enable_events=True, + ), + ], + ], + title=i18n("常规设置"), + ), + sg.Frame( + layout=[ + [ + sg.Text(i18n("采样长度")), + sg.Slider( + range=(0.02, 1.5), + key="block_time", + resolution=0.01, + orientation="h", + default_value=data.get("block_time", 0.25), + enable_events=True, + ), + ], + # [ + # sg.Text("设备延迟"), + # sg.Slider( + # range=(0, 1), + # key="device_latency", + # resolution=0.001, + # orientation="h", + # default_value=data.get("device_latency", 0.1), + # enable_events=True, + # ), + # ], + [ + sg.Text(i18n("harvest进程数")), + sg.Slider( + range=(1, n_cpu), + key="n_cpu", + resolution=1, + orientation="h", + default_value=data.get( + "n_cpu", min(self.gui_config.n_cpu, n_cpu) + ), + enable_events=True, + ), + ], + [ + sg.Text(i18n("淡入淡出长度")), + sg.Slider( + range=(0.01, 0.15), + key="crossfade_length", + resolution=0.01, + orientation="h", + default_value=data.get("crossfade_length", 0.05), + enable_events=True, + ), + ], + [ + sg.Text(i18n("额外推理时长")), + sg.Slider( + range=(0.05, 5.00), + key="extra_time", + resolution=0.01, + orientation="h", + default_value=data.get("extra_time", 2.5), + enable_events=True, + ), + ], + [ + sg.Checkbox( + i18n("输入降噪"), + key="I_noise_reduce", + enable_events=True, + ), + sg.Checkbox( + i18n("输出降噪"), + 
key="O_noise_reduce", + enable_events=True, + ), + sg.Checkbox( + i18n("启用相位声码器"), + key="use_pv", + default=data.get("use_pv", False), + enable_events=True, + ), + # sg.Checkbox( + # "JIT加速", + # default=self.config.use_jit, + # key="use_jit", + # enable_events=False, + # ), + ], + # [sg.Text("注:首次使用JIT加速时,会出现卡顿,\n 并伴随一些噪音,但这是正常现象!")], + ], + title=i18n("性能设置"), + ), + ], + [ + sg.Button(i18n("开始音频转换"), key="start_vc"), + sg.Button(i18n("停止音频转换"), key="stop_vc"), + sg.Radio( + i18n("输入监听"), + "function", + key="im", + default=False, + enable_events=True, + ), + sg.Radio( + i18n("输出变声"), + "function", + key="vc", + default=True, + enable_events=True, + ), + sg.Text(i18n("算法延迟(ms):")), + sg.Text("0", key="delay_time"), + sg.Text(i18n("推理时间(ms):")), + sg.Text("0", key="infer_time"), + ], + ] + self.window = sg.Window("RVC - GUI", layout=layout, finalize=True) + self.event_handler() + + def event_handler(self): + global flag_vc + while True: + event, values = self.window.read() + if event == sg.WINDOW_CLOSED: + self.stop_stream() + exit() + if event == "reload_devices" or event == "sg_hostapi": + self.gui_config.sg_hostapi = values["sg_hostapi"] + self.update_devices(hostapi_name=values["sg_hostapi"]) + if self.gui_config.sg_hostapi not in self.hostapis: + self.gui_config.sg_hostapi = self.hostapis[0] + self.window["sg_hostapi"].Update(values=self.hostapis) + self.window["sg_hostapi"].Update(value=self.gui_config.sg_hostapi) + if ( + self.gui_config.sg_input_device not in self.input_devices + and len(self.input_devices) > 0 + ): + self.gui_config.sg_input_device = self.input_devices[0] + self.window["sg_input_device"].Update(values=self.input_devices) + self.window["sg_input_device"].Update( + value=self.gui_config.sg_input_device + ) + if self.gui_config.sg_output_device not in self.output_devices: + self.gui_config.sg_output_device = self.output_devices[0] + self.window["sg_output_device"].Update(values=self.output_devices) + self.window["sg_output_device"].Update( 
+ value=self.gui_config.sg_output_device + ) + if event == "start_vc" and not flag_vc: + if self.set_values(values) == True: + printt("cuda_is_available: %s", torch.cuda.is_available()) + self.start_vc() + settings = { + "pth_path": values["pth_path"], + "index_path": values["index_path"], + "sg_hostapi": values["sg_hostapi"], + "sg_wasapi_exclusive": values["sg_wasapi_exclusive"], + "sg_input_device": values["sg_input_device"], + "sg_output_device": values["sg_output_device"], + "sr_type": ["sr_model", "sr_device"][ + [ + values["sr_model"], + values["sr_device"], + ].index(True) + ], + "threhold": values["threhold"], + "pitch": values["pitch"], + "rms_mix_rate": values["rms_mix_rate"], + "index_rate": values["index_rate"], + # "device_latency": values["device_latency"], + "block_time": values["block_time"], + "crossfade_length": values["crossfade_length"], + "extra_time": values["extra_time"], + "n_cpu": values["n_cpu"], + # "use_jit": values["use_jit"], + "use_jit": False, + "use_pv": values["use_pv"], + "f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ + [ + values["pm"], + values["harvest"], + values["crepe"], + values["rmvpe"], + values["fcpe"], + ].index(True) + ], + } + with open("configs/inuse/config.json", "w") as j: + json.dump(settings, j) + if self.stream is not None: + self.delay_time = ( + self.stream.latency[-1] + + values["block_time"] + + values["crossfade_length"] + + 0.01 + ) + if values["I_noise_reduce"]: + self.delay_time += min(values["crossfade_length"], 0.04) + self.window["sr_stream"].update(self.gui_config.samplerate) + self.window["delay_time"].update( + int(np.round(self.delay_time * 1000)) + ) + # Parameter hot update + if event == "threhold": + self.gui_config.threhold = values["threhold"] + elif event == "pitch": + self.gui_config.pitch = values["pitch"] + if hasattr(self, "rvc"): + self.rvc.change_key(values["pitch"]) + elif event == "formant": + self.gui_config.formant = values["formant"] + if hasattr(self, "rvc"): + 
self.rvc.change_formant(values["formant"]) + elif event == "index_rate": + self.gui_config.index_rate = values["index_rate"] + if hasattr(self, "rvc"): + self.rvc.change_index_rate(values["index_rate"]) + elif event == "rms_mix_rate": + self.gui_config.rms_mix_rate = values["rms_mix_rate"] + elif event in ["pm", "harvest", "crepe", "rmvpe", "fcpe"]: + self.gui_config.f0method = event + elif event == "I_noise_reduce": + self.gui_config.I_noise_reduce = values["I_noise_reduce"] + if self.stream is not None: + self.delay_time += ( + 1 if values["I_noise_reduce"] else -1 + ) * min(values["crossfade_length"], 0.04) + self.window["delay_time"].update( + int(np.round(self.delay_time * 1000)) + ) + elif event == "O_noise_reduce": + self.gui_config.O_noise_reduce = values["O_noise_reduce"] + elif event == "use_pv": + self.gui_config.use_pv = values["use_pv"] + elif event in ["vc", "im"]: + self.function = event + elif event == "stop_vc" or event != "start_vc": + # Other parameters do not support hot update + self.stop_stream() + + def set_values(self, values): + if len(values["pth_path"].strip()) == 0: + sg.popup(i18n("请选择pth文件")) + return False + if len(values["index_path"].strip()) == 0: + sg.popup(i18n("请选择index文件")) + return False + pattern = re.compile("[^\x00-\x7F]+") + if pattern.findall(values["pth_path"]): + sg.popup(i18n("pth文件路径不可包含中文")) + return False + if pattern.findall(values["index_path"]): + sg.popup(i18n("index文件路径不可包含中文")) + return False + self.set_devices(values["sg_input_device"], values["sg_output_device"]) + self.config.use_jit = False # values["use_jit"] + # self.device_latency = values["device_latency"] + self.gui_config.sg_hostapi = values["sg_hostapi"] + self.gui_config.sg_wasapi_exclusive = values["sg_wasapi_exclusive"] + self.gui_config.sg_input_device = values["sg_input_device"] + self.gui_config.sg_output_device = values["sg_output_device"] + self.gui_config.pth_path = values["pth_path"] + self.gui_config.index_path = values["index_path"] + 
self.gui_config.sr_type = ["sr_model", "sr_device"][ + [ + values["sr_model"], + values["sr_device"], + ].index(True) + ] + self.gui_config.threhold = values["threhold"] + self.gui_config.pitch = values["pitch"] + self.gui_config.formant = values["formant"] + self.gui_config.block_time = values["block_time"] + self.gui_config.crossfade_time = values["crossfade_length"] + self.gui_config.extra_time = values["extra_time"] + self.gui_config.I_noise_reduce = values["I_noise_reduce"] + self.gui_config.O_noise_reduce = values["O_noise_reduce"] + self.gui_config.use_pv = values["use_pv"] + self.gui_config.rms_mix_rate = values["rms_mix_rate"] + self.gui_config.index_rate = values["index_rate"] + self.gui_config.n_cpu = values["n_cpu"] + self.gui_config.f0method = ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ + [ + values["pm"], + values["harvest"], + values["crepe"], + values["rmvpe"], + values["fcpe"], + ].index(True) + ] + return True + + def start_vc(self): + torch.cuda.empty_cache() + self.rvc = rvc_for_realtime.RVC( + self.gui_config.pitch, + self.gui_config.formant, + self.gui_config.pth_path, + self.gui_config.index_path, + self.gui_config.index_rate, + self.gui_config.n_cpu, + inp_q, + opt_q, + self.config, + self.rvc if hasattr(self, "rvc") else None, + ) + self.gui_config.samplerate = ( + self.rvc.tgt_sr + if self.gui_config.sr_type == "sr_model" + else self.get_device_samplerate() + ) + self.gui_config.channels = self.get_device_channels() + self.zc = self.gui_config.samplerate // 100 + self.block_frame = ( + int( + np.round( + self.gui_config.block_time + * self.gui_config.samplerate + / self.zc + ) + ) + * self.zc + ) + self.block_frame_16k = 160 * self.block_frame // self.zc + self.crossfade_frame = ( + int( + np.round( + self.gui_config.crossfade_time + * self.gui_config.samplerate + / self.zc + ) + ) + * self.zc + ) + self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc) + self.sola_search_frame = self.zc + self.extra_frame = ( + int( + np.round( 
+ self.gui_config.extra_time + * self.gui_config.samplerate + / self.zc + ) + ) + * self.zc + ) + self.input_wav: torch.Tensor = torch.zeros( + self.extra_frame + + self.crossfade_frame + + self.sola_search_frame + + self.block_frame, + device=self.config.device, + dtype=torch.float32, + ) + self.input_wav_denoise: torch.Tensor = self.input_wav.clone() + self.input_wav_res: torch.Tensor = torch.zeros( + 160 * self.input_wav.shape[0] // self.zc, + device=self.config.device, + dtype=torch.float32, + ) + self.rms_buffer: np.ndarray = np.zeros(4 * self.zc, dtype="float32") + self.sola_buffer: torch.Tensor = torch.zeros( + self.sola_buffer_frame, device=self.config.device, dtype=torch.float32 + ) + self.nr_buffer: torch.Tensor = self.sola_buffer.clone() + self.output_buffer: torch.Tensor = self.input_wav.clone() + self.skip_head = self.extra_frame // self.zc + self.return_length = ( + self.block_frame + self.sola_buffer_frame + self.sola_search_frame + ) // self.zc + self.fade_in_window: torch.Tensor = ( + torch.sin( + 0.5 + * np.pi + * torch.linspace( + 0.0, + 1.0, + steps=self.sola_buffer_frame, + device=self.config.device, + dtype=torch.float32, + ) + ) + ** 2 + ) + self.fade_out_window: torch.Tensor = 1 - self.fade_in_window + self.resampler = tat.Resample( + orig_freq=self.gui_config.samplerate, + new_freq=16000, + dtype=torch.float32, + ).to(self.config.device) + if self.rvc.tgt_sr != self.gui_config.samplerate: + self.resampler2 = tat.Resample( + orig_freq=self.rvc.tgt_sr, + new_freq=self.gui_config.samplerate, + dtype=torch.float32, + ).to(self.config.device) + else: + self.resampler2 = None + self.tg = TorchGate( + sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 + ).to(self.config.device) + self.start_stream() + + def start_stream(self): + global flag_vc + if not flag_vc: + flag_vc = True + if ( + "WASAPI" in self.gui_config.sg_hostapi + and self.gui_config.sg_wasapi_exclusive + ): + extra_settings = sd.WasapiSettings(exclusive=True) + else: 
+ extra_settings = None + self.stream = sd.Stream( + callback=self.audio_callback, + blocksize=self.block_frame, + samplerate=self.gui_config.samplerate, + channels=self.gui_config.channels, + dtype="float32", + extra_settings=extra_settings, + ) + self.stream.start() + + def stop_stream(self): + global flag_vc + if flag_vc: + flag_vc = False + if self.stream is not None: + self.stream.abort() + self.stream.close() + self.stream = None + + def audio_callback( + self, indata: np.ndarray, outdata: np.ndarray, frames, times, status + ): + """ + 音频处理 + """ + global flag_vc + start_time = time.perf_counter() + indata = librosa.to_mono(indata.T) + if self.gui_config.threhold > -60: + indata = np.append(self.rms_buffer, indata) + rms = librosa.feature.rms( + y=indata, frame_length=4 * self.zc, hop_length=self.zc + )[:, 2:] + self.rms_buffer[:] = indata[-4 * self.zc :] + indata = indata[2 * self.zc - self.zc // 2 :] + db_threhold = ( + librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold + ) + for i in range(db_threhold.shape[0]): + if db_threhold[i]: + indata[i * self.zc : (i + 1) * self.zc] = 0 + indata = indata[self.zc // 2 :] + self.input_wav[: -self.block_frame] = self.input_wav[ + self.block_frame : + ].clone() + self.input_wav[-indata.shape[0] :] = torch.from_numpy(indata).to( + self.config.device + ) + self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[ + self.block_frame_16k : + ].clone() + # input noise reduction and resampling + if self.gui_config.I_noise_reduce: + self.input_wav_denoise[: -self.block_frame] = self.input_wav_denoise[ + self.block_frame : + ].clone() + input_wav = self.input_wav[-self.sola_buffer_frame - self.block_frame :] + input_wav = self.tg( + input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) + ).squeeze(0) + input_wav[: self.sola_buffer_frame] *= self.fade_in_window + input_wav[: self.sola_buffer_frame] += ( + self.nr_buffer * self.fade_out_window + ) + self.input_wav_denoise[-self.block_frame :] = input_wav[ 
+ : self.block_frame + ] + self.nr_buffer[:] = input_wav[self.block_frame :] + self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler( + self.input_wav_denoise[-self.block_frame - 2 * self.zc :] + )[160:] + else: + self.input_wav_res[-160 * (indata.shape[0] // self.zc + 1) :] = ( + self.resampler(self.input_wav[-indata.shape[0] - 2 * self.zc :])[ + 160: + ] + ) + # infer + if self.function == "vc": + infer_wav = self.rvc.infer( + self.input_wav_res, + self.block_frame_16k, + self.skip_head, + self.return_length, + self.gui_config.f0method, + ) + if self.resampler2 is not None: + infer_wav = self.resampler2(infer_wav) + elif self.gui_config.I_noise_reduce: + infer_wav = self.input_wav_denoise[self.extra_frame :].clone() + else: + infer_wav = self.input_wav[self.extra_frame :].clone() + # output noise reduction + if self.gui_config.O_noise_reduce and self.function == "vc": + self.output_buffer[: -self.block_frame] = self.output_buffer[ + self.block_frame : + ].clone() + self.output_buffer[-self.block_frame :] = infer_wav[-self.block_frame :] + infer_wav = self.tg( + infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0) + ).squeeze(0) + # volume envelop mixing + if self.gui_config.rms_mix_rate < 1 and self.function == "vc": + if self.gui_config.I_noise_reduce: + input_wav = self.input_wav_denoise[self.extra_frame :] + else: + input_wav = self.input_wav[self.extra_frame :] + rms1 = librosa.feature.rms( + y=input_wav[: infer_wav.shape[0]].cpu().numpy(), + frame_length=4 * self.zc, + hop_length=self.zc, + ) + rms1 = torch.from_numpy(rms1).to(self.config.device) + rms1 = F.interpolate( + rms1.unsqueeze(0), + size=infer_wav.shape[0] + 1, + mode="linear", + align_corners=True, + )[0, 0, :-1] + rms2 = librosa.feature.rms( + y=infer_wav[:].cpu().numpy(), + frame_length=4 * self.zc, + hop_length=self.zc, + ) + rms2 = torch.from_numpy(rms2).to(self.config.device) + rms2 = F.interpolate( + rms2.unsqueeze(0), + size=infer_wav.shape[0] + 1, + mode="linear", + 
align_corners=True, + )[0, 0, :-1] + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3) + infer_wav *= torch.pow( + rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate) + ) + # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC + conv_input = infer_wav[ + None, None, : self.sola_buffer_frame + self.sola_search_frame + ] + cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) + cor_den = torch.sqrt( + F.conv1d( + conv_input**2, + torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device), + ) + + 1e-8 + ) + if sys.platform == "darwin": + _, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0]) + sola_offset = sola_offset.item() + else: + sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) + printt("sola_offset = %d", int(sola_offset)) + infer_wav = infer_wav[sola_offset:] + if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: + infer_wav[: self.sola_buffer_frame] *= self.fade_in_window + infer_wav[: self.sola_buffer_frame] += ( + self.sola_buffer * self.fade_out_window + ) + else: + infer_wav[: self.sola_buffer_frame] = phase_vocoder( + self.sola_buffer, + infer_wav[: self.sola_buffer_frame], + self.fade_out_window, + self.fade_in_window, + ) + self.sola_buffer[:] = infer_wav[ + self.block_frame : self.block_frame + self.sola_buffer_frame + ] + outdata[:] = ( + infer_wav[: self.block_frame] + .repeat(self.gui_config.channels, 1) + .t() + .cpu() + .numpy() + ) + total_time = time.perf_counter() - start_time + if flag_vc: + self.window["infer_time"].update(int(total_time * 1000)) + printt("Infer time: %.2f", total_time) + + def update_devices(self, hostapi_name=None): + """获取设备列表""" + global flag_vc + flag_vc = False + sd._terminate() + sd._initialize() + devices = sd.query_devices() + hostapis = sd.query_hostapis() + for hostapi in hostapis: + for device_idx in hostapi["devices"]: + devices[device_idx]["hostapi_name"] = hostapi["name"] + self.hostapis = [hostapi["name"] for hostapi in hostapis] + if 
hostapi_name not in self.hostapis: + hostapi_name = self.hostapis[0] + self.input_devices = [ + d["name"] + for d in devices + if d["max_input_channels"] > 0 and d["hostapi_name"] == hostapi_name + ] + self.output_devices = [ + d["name"] + for d in devices + if d["max_output_channels"] > 0 and d["hostapi_name"] == hostapi_name + ] + self.input_devices_indices = [ + d["index"] if "index" in d else d["name"] + for d in devices + if d["max_input_channels"] > 0 and d["hostapi_name"] == hostapi_name + ] + self.output_devices_indices = [ + d["index"] if "index" in d else d["name"] + for d in devices + if d["max_output_channels"] > 0 and d["hostapi_name"] == hostapi_name + ] + + def set_devices(self, input_device, output_device): + """设置输出设备""" + sd.default.device[0] = self.input_devices_indices[ + self.input_devices.index(input_device) + ] + sd.default.device[1] = self.output_devices_indices[ + self.output_devices.index(output_device) + ] + printt("Input device: %s:%s", str(sd.default.device[0]), input_device) + printt("Output device: %s:%s", str(sd.default.device[1]), output_device) + + def get_device_samplerate(self): + return int( + sd.query_devices(device=sd.default.device[0])["default_samplerate"] + ) + + def get_device_channels(self): + max_input_channels = sd.query_devices(device=sd.default.device[0])[ + "max_input_channels" + ] + max_output_channels = sd.query_devices(device=sd.default.device[1])[ + "max_output_channels" + ] + return min(max_input_channels, max_output_channels, 2) + + gui = GUI() diff --git a/launch_soprano_rvc.sh b/launch_soprano_rvc.sh new file mode 100755 index 0000000..3a8a700 --- /dev/null +++ b/launch_soprano_rvc.sh @@ -0,0 +1,260 @@ +#!/bin/bash +# Soprano TTS to RVC Pipeline Launcher +# This script helps you set up and run the soprano->RVC pipeline + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VENV_PATH="$SCRIPT_DIR/.venv" +RVC_DIR="$SCRIPT_DIR/Retrieval-based-Voice-Conversion-WebUI" 
#!/bin/bash
# Soprano TTS to RVC Pipeline Launcher
#
# Starts the Soprano TTS virtual-sink script, the RVC realtime GUI, or both
# (each in its own terminal window).
#
# Usage: launch_soprano_rvc.sh [soprano|rvc|both|help]   (default: both)

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Absolute path of this script. The child terminals `cd` before re-invoking
# the launcher, so a relative "$0" would no longer resolve there.
SCRIPT_PATH="$SCRIPT_DIR/$(basename "${BASH_SOURCE[0]}")"
VENV_PATH="$SCRIPT_DIR/.venv"
RVC_DIR="$SCRIPT_DIR/Retrieval-based-Voice-Conversion-WebUI"
RVC_GUI="$RVC_DIR/gui_v1.py"
SOPRANO_SCRIPT="$SCRIPT_DIR/soprano_to_virtual_sink.py"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Print colored output. The message is passed as a printf argument (%s) so
# that backslashes or '%' in it are printed literally.
print_info() {
    printf "${BLUE}ℹ ${NC}%s\n" "$1"
}

print_success() {
    printf "${GREEN}✓${NC} %s\n" "$1"
}

print_warning() {
    printf "${YELLOW}⚠${NC} %s\n" "$1"
}

print_error() {
    printf "${RED}✗${NC} %s\n" "$1"
}

# Print a section title framed by two horizontal rules.
print_header() {
    local rule="═══════════════════════════════════════════════════════════════════"
    echo ""
    printf "${BLUE}%s${NC}\n" "$rule"
    printf "${BLUE}  %s${NC}\n" "$1"
    printf "${BLUE}%s${NC}\n" "$rule"
    echo ""
}

# Check prerequisites; exits with status 1 on the first missing one.
check_prerequisites() {
    print_header "Checking Prerequisites"

    # Check if virtual environment exists
    if [ ! -d "$VENV_PATH" ]; then
        print_error "Virtual environment not found at: $VENV_PATH"
        exit 1
    fi
    # start_soprano/start_rvc source this file, so verify it up front
    if [ ! -f "$VENV_PATH/bin/activate" ]; then
        print_error "Virtual environment has no bin/activate: $VENV_PATH"
        exit 1
    fi
    print_success "Virtual environment found"

    # Check if RVC GUI exists
    if [ ! -f "$RVC_GUI" ]; then
        print_error "RVC GUI not found at: $RVC_GUI"
        exit 1
    fi
    print_success "RVC GUI found"

    # Check if soprano script exists
    if [ ! -f "$SOPRANO_SCRIPT" ]; then
        print_error "Soprano script not found at: $SOPRANO_SCRIPT"
        exit 1
    fi
    print_success "Soprano script found"

    # Check if pactl is available (PulseAudio)
    if ! command -v pactl &> /dev/null; then
        print_error "pactl (PulseAudio) not found. Please install PulseAudio."
        exit 1
    fi
    print_success "PulseAudio found"
}

# Display usage instructions
show_usage() {
    print_header "Soprano TTS to RVC Pipeline"

    echo "This script helps you run a text-to-speech pipeline where:"
    echo "  1. You type text into the Soprano TTS script"
    echo "  2. Soprano generates speech and outputs to a virtual sink"
    echo "  3. RVC reads from that virtual sink and applies voice conversion"
    echo "  4. RVC outputs the converted voice to your speakers/headphones"
    echo ""
    echo "Usage:"
    echo "  $0 [option]"
    echo ""
    echo "Options:"
    echo "  soprano    - Start only the Soprano TTS virtual sink script"
    echo "  rvc        - Start only the RVC realtime GUI"
    echo "  both       - Start both in separate terminal windows (default)"
    echo "  help       - Show this help message"
    echo ""
}

# Start soprano script (runs in the foreground of the current terminal)
start_soprano() {
    print_header "Starting Soprano TTS Virtual Sink"

    print_info "Activating virtual environment..."
    source "$VENV_PATH/bin/activate"

    print_info "Starting soprano_to_virtual_sink.py..."
    print_info "This will create a virtual sink: soprano_to_rvc"
    echo ""

    python "$SOPRANO_SCRIPT"
}

# Start RVC GUI (runs in the foreground of the current terminal)
start_rvc() {
    print_header "Starting RVC Realtime GUI"

    print_info "Activating virtual environment..."
    source "$VENV_PATH/bin/activate"

    print_info "Changing to RVC directory..."
    cd "$RVC_DIR"

    print_info "Starting RVC GUI..."
    echo ""
    print_warning "IMPORTANT: In the RVC GUI, select 'soprano_to_rvc.monitor' as your INPUT device!"
    echo ""
    sleep 2

    python "$RVC_GUI"
}

# Print the name of the first installed terminal emulator we know how to
# drive; return 1 when none is installed.
detect_terminal() {
    local candidate
    for candidate in gnome-terminal konsole xfce4-terminal alacritty kitty xterm; do
        if command -v "$candidate" &> /dev/null; then
            echo "$candidate"
            return 0
        fi
    done
    return 1
}

# Open a new window of terminal $1 running "<this script> $2", then leave an
# interactive shell open so the output stays visible after the component exits.
launch_in_terminal() {
    local terminal="$1"
    local mode="$2"
    # Paths travel as positional parameters instead of being spliced into the
    # command string, so spaces or quotes in them cannot break the command.
    local -a cmd=(
        bash -c 'cd "$1" && bash "$2" "$3"; exec bash'
        launcher "$SCRIPT_DIR" "$SCRIPT_PATH" "$mode"
    )

    case "$terminal" in
        gnome-terminal)
            gnome-terminal -- "${cmd[@]}" &
            ;;
        konsole)
            konsole -e "${cmd[@]}" &
            ;;
        xfce4-terminal)
            # -x runs the remainder of the command line (argv form), unlike
            # -e which takes one pre-quoted string.
            xfce4-terminal -x "${cmd[@]}" &
            ;;
        alacritty)
            alacritty -e "${cmd[@]}" &
            ;;
        kitty)
            kitty "${cmd[@]}" &
            ;;
        xterm)
            xterm -e "${cmd[@]}" &
            ;;
        *)
            print_error "Unsupported terminal emulator: $terminal"
            return 1
            ;;
    esac
}

# Start both in separate terminals
start_both() {
    print_header "Starting Both Components"

    print_info "This will open two terminal windows:"
    print_info "  1. Soprano TTS Virtual Sink (for text input)"
    print_info "  2. RVC Realtime GUI (for voice conversion)"
    echo ""

    # Detect terminal emulator
    if ! TERMINAL="$(detect_terminal)"; then
        print_error "No suitable terminal emulator found"
        print_info "Please start the components manually:"
        print_info "  Terminal 1: $0 soprano"
        print_info "  Terminal 2: $0 rvc"
        exit 1
    fi

    print_success "Using terminal: $TERMINAL"

    # Start soprano in new terminal
    print_info "Starting Soprano TTS in new terminal..."
    launch_in_terminal "$TERMINAL" soprano

    # Give the soprano window a head start before the GUI opens
    sleep 2

    # Start RVC in new terminal
    print_info "Starting RVC GUI in new terminal..."
    launch_in_terminal "$TERMINAL" rvc

    echo ""
    print_success "Both components started in separate terminals"
    echo ""
    print_header "Quick Setup Guide"
    echo "1. In the RVC GUI window:"
    echo "   - Select your RVC model (.pth file)"
    echo "   - Select the corresponding index file"
    echo "   - Choose 'soprano_to_rvc.monitor' as INPUT device"
    echo "   - Choose your speakers/headphones as OUTPUT device"
    echo "   - Click 'Start Voice Conversion'"
    echo ""
    echo "2. In the Soprano TTS window:"
    echo "   - Type any text you want to convert"
    echo "   - Press Enter to generate and stream"
    echo ""
    echo "3. Listen to the RVC-converted output!"
    echo ""
    print_info "Press Ctrl+C in each terminal to stop"
    echo ""
}

# Main script: dispatch on the first argument (defaults to "both")
main() {
    case "${1:-both}" in
        soprano)
            check_prerequisites
            start_soprano
            ;;
        rvc)
            check_prerequisites
            start_rvc
            ;;
        both)
            check_prerequisites
            start_both
            ;;
        help|--help|-h)
            show_usage
            ;;
        *)
            print_error "Unknown option: $1"
            show_usage
            exit 1
            ;;
    esac
}

main "$@"
+markdown-it-py==2.2.0 +MarkupSafe==2.1.5 +matplotlib==3.10.8 +matplotlib-inline==0.2.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +more-itertools==10.8.0 +mpmath==1.3.0 +msgpack==1.1.2 +multidict==6.7.0 +narwhals==2.15.0 +networkx==3.4.2 +numba==0.56.4 +numpy==1.23.5 +omegaconf==2.0.6 +onnxruntime==1.23.2 +onnxruntime-gpu==1.23.2 +orjson==3.11.5 +packaging==25.0 +pandas==2.3.3 +pillow==10.4.0 +platformdirs==4.5.1 +pooch==1.8.2 +portalocker==3.2.0 +praat-parselmouth==0.4.7 +propcache==0.4.1 +protobuf==6.33.3 +psutil==7.2.1 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycparser==2.23 +pydantic==1.10.26 +pydantic_core==2.41.5 +pydub==0.25.1 +Pygments==2.19.2 +pyparsing==3.3.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-multipart==0.0.21 +pytorch-triton-rocm==3.1.0 +pytz==2025.2 +pyworld==0.3.2 +PyYAML==6.0.3 +referencing==0.37.0 +regex==2025.11.3 +requests==2.32.5 +resampy==0.4.3 +rich==14.2.0 +rpds-py==0.30.0 +sacrebleu==2.5.1 +safehttpx==0.1.7 +safetensors==0.7.0 +scikit-learn==1.7.2 +scipy==1.15.3 +semantic-version==2.10.0 +shellingham==1.5.4 +six==1.17.0 +-e git+https://github.com/ekwek1/soprano.git@5c759351f9e115aa364d5f4453ddaa7ee0d6f15e#egg=soprano_tts +sounddevice==0.5.3 +soundfile==0.13.1 +soxr==1.0.0 +starlette==0.22.0 +sympy==1.13.1 +tabulate==0.9.0 +tensorboard==2.20.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.4 +threadpoolctl==3.6.0 +tokenizers==0.22.2 +tomlkit==0.13.3 +torch==2.5.1+rocm6.2 +torchaudio==2.5.1+rocm6.2 +torchcrepe==0.0.23 +torchfcpe==0.0.4 +torchvision==0.20.1+rocm6.2 +tornado==6.5.4 +tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.57.3 +typeguard==4.4.4 +typer==0.21.1 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +tzdata==2025.3 +uc-micro-py==1.0.3 +Unidecode==1.4.0 +urllib3==2.6.3 +uvicorn==0.40.0 +websockets==11.0.3 +Werkzeug==3.1.5 +yarl==1.22.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c8a01b8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,86 @@ +# 
========================================================== +# Unified Soprano + RVC environment +# Python == 3.10.19 +# ROCm == 6.2 +# ========================================================== + +# ---------------------- +# Core ML / GPU stack +# ---------------------- +torch==2.5.1+rocm6.2 +torchaudio==2.5.1+rocm6.2 +torchvision==0.20.1+rocm6.2 +pytorch-triton-rocm==3.1.0 + +# ---------------------- +# Numerical stack (RVC-safe) +# ---------------------- +numpy==1.23.5 +scipy==1.15.3 +scikit-learn==1.7.2 + +# ---------------------- +# Audio processing +# ---------------------- +sounddevice==0.5.3 +soundfile==0.13.1 +pydub==0.25.1 +librosa==0.10.2 +soxr==1.0.0 +resampy==0.4.3 +praat-parselmouth==0.4.7 +pyworld==0.3.2 +av==16.1.0 + +# ---------------------- +# RVC core +# ---------------------- +fairseq==0.12.2 +faiss-cpu==1.7.3 +numba==0.56.4 +llvmlite==0.39.0 +torchcrepe==0.0.23 +torchfcpe==0.0.4 +einops==0.8.1 +local-attention==1.11.2 +omegaconf==2.0.6 +hydra-core==1.0.7 + +# ---------------------- +# Soprano TTS +# ---------------------- +transformers==4.57.3 +accelerate==1.12.0 +tokenizers==0.22.2 +safetensors==0.7.0 +huggingface-hub==0.36.0 +inflect==7.5.0 +Unidecode==1.4.0 + +# ---------------------- +# Web / UI +# ---------------------- +fastapi==0.88.0 +starlette==0.22.0 +uvicorn==0.40.0 +gradio==3.48.0 +gradio_client==0.6.1 +python-multipart==0.0.21 +orjson==3.11.5 + +# ---------------------- +# Utilities +# ---------------------- +tqdm==4.67.1 +rich==14.2.0 +psutil==7.2.1 +requests==2.32.5 +regex==2025.11.3 +filelock==3.20.0 +packaging==25.0 +PyYAML==6.0.3 + +# ---------------------- +# Editable installs (local) +# ---------------------- +-e git+https://github.com/ekwek1/soprano.git@5c759351f9e115aa364d5f4453ddaa7ee0d6f15e#egg=soprano_tts diff --git a/setup_alsa_bridge.sh b/setup_alsa_bridge.sh new file mode 100755 index 0000000..36ee2a4 --- /dev/null +++ b/setup_alsa_bridge.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Setup script to make soprano_to_rvc 
#!/bin/bash
# Setup script to make soprano_to_rvc available as an ALSA device for RVC.
#
# Appends `pcm.soprano_rvc` / `ctl.soprano_rvc` definitions of type `pulse`
# to ~/.asoundrc, both pointing at the monitor source of the sink named in
# SINK_NAME. Re-running is a no-op once the definition is present.

set -e

ASOUND_RC="$HOME/.asoundrc"
SINK_NAME="soprano_to_rvc"

echo "Setting up ALSA configuration for ${SINK_NAME}..."

# Check if our configuration already exists.
# -F matches the text literally; as a regex the '.' would match any character.
if grep -qF "pcm.soprano_rvc" "$ASOUND_RC" 2>/dev/null; then
    echo "✓ Configuration already exists in .asoundrc"
else
    # Back up the existing .asoundrc only when it is about to be modified,
    # so repeated runs do not pile up identical backups.
    if [ -f "$ASOUND_RC" ]; then
        cp "$ASOUND_RC" "${ASOUND_RC}.backup.$(date +%s)"
        echo "✓ Backed up existing .asoundrc"
    fi

    echo "Adding ALSA configuration..."

    # Unquoted heredoc delimiter so that ${SINK_NAME} is expanded.
    cat >> "$ASOUND_RC" << EOF

# Soprano to RVC bridge
pcm.soprano_rvc {
    type pulse
    device ${SINK_NAME}.monitor
    hint {
        show on
        description "Soprano TTS to RVC Bridge"
    }
}

ctl.soprano_rvc {
    type pulse
    device ${SINK_NAME}.monitor
}
EOF

    echo "✓ Added ALSA configuration to .asoundrc"
fi

echo ""
# 70-character rule. (`echo "=" * 70` would glob-expand '*' to file names.)
printf '=%.0s' {1..70}
echo ""
echo "Setup complete!"
echo ""
echo "The virtual device 'soprano_rvc' is now available as an ALSA device."
echo ""
echo "In RVC GUI:"
echo "  1. Set device type to 'ALSA'"
echo "  2. Select 'soprano_rvc' or 'Soprano TTS to RVC Bridge' as input"
echo "  3. Make sure the soprano_to_virtual_sink.py script is running"
echo ""
#!/usr/bin/env python3
"""
Soprano TTS to Virtual Sink

Reads lines of text from stdin, synthesizes them with Soprano TTS and streams
the audio into a PulseAudio null sink. The sink's monitor source
(``<sink name>.monitor``) can then be selected as the input device of the RVC
realtime voice-conversion GUI.
"""

import sys
import os
import subprocess
import signal
import time
import traceback
import sounddevice as sd
import numpy as np
import torch
from scipy import signal as scipy_signal

# Add soprano to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano'))
from soprano import SopranoTTS

# Configuration
VIRTUAL_SINK_NAME = "soprano_to_rvc"
SAMPLE_RATE = 48000  # Rate of the virtual sink and of the output stream
SOPRANO_RATE = 32000  # Soprano outputs at 32kHz
CHANNELS = 2  # Use stereo to match RVC expectations

# Cleared by the SIGINT handler; polled by the streaming and prompt loops.
running = True

# True only while stream_to_virtual_sink() is feeding audio. Tells the SIGINT
# handler whether to stop cooperatively (streaming) or to interrupt a
# blocking call such as input() (everything else).
_streaming = False


def signal_handler(sig, frame):
    """Handle Ctrl+C gracefully.

    Always clears ``running``. While audio is streaming that is enough: the
    streaming loop checks the flag between chunks. Otherwise the handler
    raises ``KeyboardInterrupt``, because a handler that returns normally
    lets a blocked ``input()`` resume and the program would only stop after
    the user pressed Enter.
    """
    global running
    print("\n\nShutting down gracefully...")
    running = False
    if not _streaming:
        raise KeyboardInterrupt


def _sink_exists():
    """Return True if a sink named exactly VIRTUAL_SINK_NAME is loaded.

    Raises:
        subprocess.CalledProcessError: pactl exited with a non-zero status.
        OSError: pactl could not be executed (e.g. it is not installed).
    """
    listing = subprocess.run(
        ["pactl", "list", "sinks", "short"],
        capture_output=True,
        text=True,
        check=True,
    )
    for line in listing.stdout.splitlines():
        fields = line.split()
        # The sink name is the second column of the short listing. Comparing
        # the whole field avoids matching e.g. "soprano_to_rvc_old".
        if len(fields) >= 2 and fields[1] == VIRTUAL_SINK_NAME:
            return True
    return False


def create_virtual_sink():
    """Create a PulseAudio virtual sink for audio output.

    Returns:
        True if the sink already exists or was created, False otherwise.
    """
    try:
        if _sink_exists():
            print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' already exists")
            print(f"  Monitor source: {VIRTUAL_SINK_NAME}.monitor")
            return True
    except subprocess.CalledProcessError:
        # Listing failed; fall through and let load-module report the problem.
        pass
    except OSError as e:
        print(f"✗ Failed to run pactl: {e}")
        return False

    print(f"Creating virtual sink: {VIRTUAL_SINK_NAME}")
    try:
        # Create a null sink (virtual audio device)
        subprocess.run(
            [
                "pactl", "load-module", "module-null-sink",
                f"sink_name={VIRTUAL_SINK_NAME}",
                f"sink_properties=device.description={VIRTUAL_SINK_NAME}",
                f"rate={SAMPLE_RATE}",
                f"channels={CHANNELS}",
            ],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"✗ Failed to create virtual sink: {e.stderr.strip()}")
        return False
    except OSError as e:
        print(f"✗ Failed to run pactl: {e}")
        return False

    print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' created successfully")
    print(f"  Monitor source: {VIRTUAL_SINK_NAME}.monitor")
    return True


def remove_virtual_sink():
    """Unload every module-null-sink instance that provides our sink.

    Best-effort cleanup: failures are reported but never raised.
    """
    print(f"\nRemoving virtual sink: {VIRTUAL_SINK_NAME}")
    wanted_arg = f"sink_name={VIRTUAL_SINK_NAME}"
    try:
        listing = subprocess.run(
            ["pactl", "list", "modules", "short"],
            capture_output=True,
            text=True,
            check=True,
        )
        removed = False
        for line in listing.stdout.splitlines():
            # Columns: <module id> <module name> <arguments...>
            fields = line.split(None, 2)
            if len(fields) < 3:
                continue
            module_id, module_name, arguments = fields
            # Only the null sink itself is ours. Other modules (e.g. a
            # loopback reading the monitor) may mention the sink name too.
            if module_name != "module-null-sink":
                continue
            tokens = [
                tok.replace('"', '').replace("'", '')
                for tok in arguments.split()
            ]
            if wanted_arg not in tokens:
                continue
            subprocess.run(["pactl", "unload-module", module_id], check=True)
            removed = True
        if removed:
            print("✓ Virtual sink removed")
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"✗ Error removing virtual sink: {e}")


def get_virtual_sink_device_id():
    """Get the sounddevice ID for our virtual sink.

    Returns:
        Index of the first device whose name contains VIRTUAL_SINK_NAME and
        that has at least one output channel, or None if there is none.
    """
    # Force refresh device list.
    # NOTE(review): these are private sounddevice helpers; presumably needed
    # because the device list is captured at initialization - confirm
    # against the sounddevice version in use.
    sd._terminate()
    sd._initialize()

    for index, device in enumerate(sd.query_devices()):
        # Skip capture-only entries: the ID is used to open an OutputStream.
        if VIRTUAL_SINK_NAME in device['name'] and device['max_output_channels'] > 0:
            return index
    return None


def _find_or_recreate_sink_device():
    """Return the sink's device ID, recreating the sink once if it is missing.

    Returns None (after printing diagnostics) when the sink cannot be found.
    """
    device_id = get_virtual_sink_device_id()
    if device_id is not None:
        return device_id

    print(f"✗ Could not find virtual sink device: {VIRTUAL_SINK_NAME}")
    print("⚠️  Attempting to recreate virtual sink...")
    if not create_virtual_sink():
        return None

    # Wait a moment for the device to appear
    time.sleep(1.0)
    device_id = get_virtual_sink_device_id()
    if device_id is None:
        print("✗ Still could not find virtual sink after recreation")
        # List every playback device: filtering by the sink name here could
        # only ever print nothing, since that lookup has just failed.
        print("\n📋 Available output devices:")
        for index, dev in enumerate(sd.query_devices()):
            if dev['max_output_channels'] > 0:
                print(f"   {index}: {dev['name']}")
    return device_id


def _prepare_chunk(chunk):
    """Turn one Soprano chunk into a (frames, CHANNELS) float32 array.

    The chunk is flattened to mono, resampled from SOPRANO_RATE to
    SAMPLE_RATE, clipped to [-1, 1] and duplicated onto every channel.
    Returns None for an empty chunk.
    """
    # Convert torch tensor to numpy if needed
    if isinstance(chunk, torch.Tensor):
        chunk = chunk.detach().cpu().numpy()

    mono = np.asarray(chunk).flatten()
    if mono.size == 0:
        return None

    # Check for invalid values before resampling
    if not np.all(np.isfinite(mono)):
        print("⚠️  Warning: Invalid values in soprano output, cleaning...")
        mono = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)

    if SOPRANO_RATE != SAMPLE_RATE:
        num_samples = int(len(mono) * SAMPLE_RATE / SOPRANO_RATE)
        if num_samples == 0:
            return None
        # NOTE(review): scipy.signal.resample is Fourier-based and treats its
        # input as periodic, so per-chunk resampling may be audible at chunk
        # boundaries - verify by listening before changing the method.
        mono = scipy_signal.resample(mono, num_samples)

    # Ensure no NaN or inf values after resampling (clip to valid range)
    if not np.all(np.isfinite(mono)):
        print("⚠️  Warning: Invalid values after resampling, cleaning...")
        mono = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)
    mono = np.clip(mono, -1.0, 1.0)

    # Duplicate the mono signal onto every output channel
    return np.column_stack([mono] * CHANNELS).astype(np.float32)


def stream_to_virtual_sink(tts_model, text, chunk_size=1):
    """Stream soprano TTS output to the virtual sink.

    Args:
        tts_model: Object providing ``infer_stream(text, chunk_size=...)``
            that yields audio chunks (torch tensors or array-likes).
        text: Text to synthesize.
        chunk_size: Forwarded unchanged to ``infer_stream``.

    Returns:
        True if all audio was streamed; False if the sink was unavailable,
        an error occurred, or streaming was interrupted by Ctrl+C.
    """
    global _streaming

    device_id = _find_or_recreate_sink_device()
    if device_id is None:
        return False

    device_info = sd.query_devices(device_id)
    print(f"✓ Using output device: {device_info['name']}")
    print(f"  Sample rate: {SAMPLE_RATE} Hz")
    print("\n🎤 Generating and streaming speech...")
    print(f"Text: \"{text}\"\n")

    # From here on Ctrl+C only clears `running`; the loop below notices it.
    _streaming = True
    try:
        # Generate streaming audio from soprano
        stream = tts_model.infer_stream(text, chunk_size=chunk_size)

        # Open output stream to virtual sink
        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype='float32',
            device=device_id,
            blocksize=0
        ) as out_stream:
            first_chunk = True
            for chunk in stream:
                if not running:
                    break

                if first_chunk:
                    print("✓ First audio chunk generated and streaming started")
                    first_chunk = False

                frames = _prepare_chunk(chunk)
                if frames is not None:
                    # Write to virtual sink
                    out_stream.write(frames)

        if not running:
            print("⏹ Streaming interrupted")
            return False

        print("✓ Speech generation and streaming completed")
        return True

    except Exception as e:
        print(f"✗ Error during streaming: {e}")
        traceback.print_exc()
        return False

    finally:
        _streaming = False


def _print_rule():
    """Print the 70-character separator used between sections."""
    print("=" * 70)


def _prompt_loop(tts):
    """Read text from stdin and stream it until quit, EOF or Ctrl+C."""
    while running:
        try:
            text = input("\n🎙️  Enter text: ").strip()
        except EOFError:
            break

        if not text:
            print("⚠️  Please enter some text")
            continue

        if text.lower() in ('quit', 'exit', 'q'):
            break

        # Stream the text to the virtual sink
        stream_to_virtual_sink(tts, text, chunk_size=1)
        print()


def main():
    """Create the sink, load Soprano and run the interactive prompt.

    Returns:
        Process exit code: 1 if the sink or the model could not be set up,
        0 otherwise (including exit via Ctrl+C).
    """
    # Set up signal handler for graceful shutdown
    signal.signal(signal.SIGINT, signal_handler)

    sink_ready = False
    try:
        _print_rule()
        print("Soprano TTS to Virtual Sink for RVC")
        _print_rule()
        print()

        # Create virtual sink
        sink_ready = create_virtual_sink()
        if not sink_ready:
            print("\n⚠️  If sink already exists, removing and recreating...")
            remove_virtual_sink()
            sink_ready = create_virtual_sink()
            if not sink_ready:
                print("✗ Failed to create virtual sink. Exiting.")
                return 1

        print()
        _print_rule()
        print("Virtual sink setup complete!")
        _print_rule()
        print()
        print("📝 Next steps:")
        print("   1. Open RVC realtime GUI (gui_v1.py)")
        print(f"   2. Select '{VIRTUAL_SINK_NAME}.monitor' as the INPUT device")
        print("   3. Select your desired output device")
        print("   4. Load your RVC model and start conversion")
        print("   5. Return here and type text to convert")
        print()
        _print_rule()
        print()

        # Initialize Soprano TTS
        print("🔄 Loading Soprano TTS model...")
        try:
            tts = SopranoTTS(
                backend='auto',
                device='auto',
                cache_size_mb=100,
                decoder_batch_size=1
            )
        except Exception as e:
            print(f"✗ Failed to load Soprano TTS: {e}")
            return 1  # the finally clause below removes the sink
        print("✓ Soprano TTS model loaded successfully")

        print()
        _print_rule()
        print("Ready! Type text to generate speech (Ctrl+C to exit)")
        _print_rule()
        print()

        # Main loop - get text input and generate speech
        _prompt_loop(tts)

    except KeyboardInterrupt:
        print("\n\n⚠️  Interrupted by user")

    finally:
        # Clean up; runs on every exit path once the sink exists
        if sink_ready:
            remove_virtual_sink()
            print("\n✓ Cleanup complete. Goodbye!")

    return 0


if __name__ == "__main__":
    sys.exit(main())