Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as bare gitlink; removed .git/ directories and absorbed into main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index). 287 files (3.1GB of ML weights properly excluded via gitignore).
241 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Gradio Web Interface for Soprano TTS
|
|
"""
|
|
|
|
import argparse
|
|
import socket
|
|
import time
|
|
import gradio as gr
|
|
import numpy as np
|
|
from soprano import SopranoTTS
|
|
from soprano.utils.streaming import play_stream
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# CLI arguments.  Parsed at import time: this module is a launcher script, so
# the parser, the model instance, and the constants below are deliberately
# module-level globals shared by the Gradio callbacks further down.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Soprano Text-to-Speech Gradio WebUI')
parser.add_argument('--model-path', '-m',
                    help='Path to local model directory (optional)')
parser.add_argument('--device', '-d', default='auto',
                    choices=['auto', 'cuda', 'cpu', 'mps'],
                    help='Device to use for inference')
parser.add_argument('--backend', '-b', default='auto',
                    choices=['auto', 'transformers', 'lmdeploy'],
                    help='Backend to use for inference')
parser.add_argument('--cache-size', '-c', type=int, default=100,
                    help='Cache size in MB (for lmdeploy backend)')
parser.add_argument('--decoder-batch-size', '-bs', type=int, default=1,
                    help='Batch size when decoding audio')
args = parser.parse_args()

# Initialize model (blocking; may be slow on first run).
print("Loading Soprano TTS model...")
model = SopranoTTS(
    backend=args.backend,
    device=args.device,
    cache_size_mb=args.cache_size,
    decoder_batch_size=args.decoder_batch_size,
    model_path=args.model_path
)
# Read back the values the model actually resolved after 'auto' selection;
# these are displayed in the UI header below.
device = model.device
backend = model.backend
print("Model loaded successfully!")

# Playback sample rate in Hz used for the Gradio audio component.
# NOTE(review): assumed to match Soprano's decoder output rate — confirm upstream.
SAMPLE_RATE = 32000
|
|
|
|
def generate_speech(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
    chunk_size: int,
    streaming: bool,
):
    """Synthesize speech for ``text``, yielding ``(audio, status)`` updates.

    Generator used as the Gradio click handler.  Each yielded pair is
    ``(audio payload or None, status message)``.  In streaming mode the audio
    is played on the server's speaker via ``play_stream()``, so the audio
    component always receives ``None``; otherwise a single
    ``(SAMPLE_RATE, int16 ndarray)`` tuple is yielded for playback in the UI.
    """
    # Guard clause: nothing to synthesize.
    if not text.strip():
        yield None, "Please enter some text to generate speech."
        return

    try:
        if streaming:
            chunk_stream = model.infer_stream(
                text,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                chunk_size=chunk_size,
            )
            yield None, "⏳ Streaming..."

            # Blocks until playback finishes; returns time-to-first-audio.
            first_audio_latency = play_stream(chunk_stream)

            yield None, (
                f"✓ Streaming complete | "
                f"{first_audio_latency*1000:.2f} ms latency"
            )
            return

        # Non-streaming path: run full inference and time it.
        started = time.perf_counter()
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        elapsed = time.perf_counter() - started

        waveform = audio.cpu().numpy()
        # Gradio's numpy audio format expects int16 PCM; scale from [-1, 1].
        pcm16 = (waveform * 32767).astype(np.int16)

        duration = len(waveform) / SAMPLE_RATE
        realtime_factor = duration / elapsed if elapsed > 0 else float("inf")

        summary = (
            f"✓ Generated {duration:.2f} s audio | "
            f"Generation time: {elapsed:.3f} s "
            f"({realtime_factor:.2f}x realtime)"
        )

        yield (SAMPLE_RATE, pcm16), summary
        return

    except Exception as e:
        # Surface backend failures in the status box instead of crashing the UI.
        yield None, f"✗ Error: {str(e)}"
|
|
|
|
|
|
# Create Gradio interface.
# NOTE(review): theme/css are gr.Blocks(...) constructor arguments — Blocks.launch()
# does not accept them — so the app styling is declared here, where demo is built.
with gr.Blocks(
    title="Soprano TTS",
    theme=gr.themes.Soft(primary_hue="green"),
    css="""
    a {
        color: var(--primary-600);
    }
    a:hover {
        color: var(--primary-700);
    }
    """,
) as demo:
    gr.Markdown(
        f"""# 🗣️ Soprano TTS

<div align="center">
<img width="300" height="300" alt="soprano-github" src="https://github.com/user-attachments/assets/4d612eac-23b8-44e6-8c59-d7ac14ebafd1" />
</div>

**Device:** {device.upper()} | **Backend:** {backend}

**Model Weights:** https://huggingface.co/ekwek/Soprano-1.1-80M
**Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
**GitHub:** https://github.com/ekwek1/soprano
"""
    )
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                lines=5,
                max_lines=10,
            )
            streaming = gr.Checkbox(
                label="Stream Audio",
                value=False,
                info="Note: This bypasses the Gradio interface and streams audio directly to your speaker."
            )
            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.0,
                    step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.2,
                    step=0.1,
                    label="Repetition Penalty",
                )
                # step=1 already constrains this slider to integers.
                # NOTE(review): removed precision=0 — gr.Slider has no
                # 'precision' parameter (that belongs to gr.Number).
                chunk_size = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=1,
                    step=1,
                    label="Chunk Size (Streaming only)",
                )
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=True,
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                max_lines=10
            )
    gr.Examples(
        examples=[
            ["Soprano is an extremely lightweight text to speech model.", 0.0, 0.95, 1.2],
            ["Artificial intelligence is transforming the world.", 0.0, 0.95, 1.2],
            ["I'm so excited, I can't even wait!", 0.0, 0.95, 1.2],
            ["Why don't you go ahead and try it?", 0.0, 0.95, 1.2],
        ],
        inputs=[text_input, temperature, top_p, repetition_penalty],
        label="Example Prompts",
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty, chunk_size, streaming],
        outputs=[audio_output, status_output],
    )
    gr.Markdown(
        f"""
### Usage tips:

- Soprano works best when each sentence is between 2 and 30 seconds long.
- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
Best results can be achieved by converting these into their phonetic form.
(1+1 -> one plus one, etc)
- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
You may also change the sampling settings for more varied results.
- Avoid improper grammar such as not using contractions, multiple spaces, etc.
"""
    )
|
|
|
|
|
|
def find_free_port(start_port=7860, max_tries=100):
    """Return the first TCP port in ``[start_port, start_port + max_tries)``
    that can currently be bound on all interfaces.

    Raises:
        OSError: if every candidate port in the range is already in use.
    """
    candidate = start_port
    while candidate < start_port + max_tries:
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            # A successful bind means the port is free right now.
            probe.bind(("", candidate))
        except OSError:
            candidate += 1
        else:
            probe.close()
            return candidate
        finally:
            # Ensure the probe socket never leaks, bound or not.
            probe.close()
    raise OSError("Could not find a free port")
|
|
|
|
def main():
    """Launch the Gradio app on the first free port at or above 7860."""
    # Start Gradio interface
    port = find_free_port(7860)
    print(f"Starting Gradio interface on port {port}")
    # NOTE(review): theme=... and css=... were previously passed here, but
    # Blocks.launch() does not accept them (they are gr.Blocks constructor
    # arguments) and passing them raises TypeError.  Styling must be applied
    # where `demo` is constructed.
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (Docker-friendly)
        server_port=port,
        share=False,
    )


if __name__ == "__main__":
    main()
|