Implemented experimental but production-ready voice chat, relegating the old flow to a voice debug mode. Added a new Web UI panel for Voice Chat.

commit 2934efba22
parent 362108f4b0
Date: 2026-01-20 23:06:17 +02:00
31 changed files with 5408 additions and 357 deletions

@@ -49,6 +49,15 @@ class ParakeetTranscriber:
         logger.info(f"Loading Parakeet model: {model_name} on {device}...")
 
+        # Set PyTorch memory allocator settings for better memory management
+        if device == "cuda":
+            # Enable expandable segments to reduce fragmentation
+            import os
+            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+            # Clear cache before loading model
+            torch.cuda.empty_cache()
+
         # Load model via NeMo from HuggingFace
         self.model = EncDecRNNTBPEModel.from_pretrained(
             model_name=model_name,
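A note on the allocator setting above: `PYTORCH_CUDA_ALLOC_CONF` is read when PyTorch's CUDA caching allocator first initializes, so it only takes effect if set before the process makes its first CUDA allocation. A minimal standalone sketch of the same idea (illustrative, not part of the commit):

```python
import os

# Must be set before the first CUDA allocation in the process; expandable
# segments let the allocator grow segments in place instead of fragmenting.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch

if torch.cuda.is_available():
    _ = torch.zeros(1024, 1024, device="cuda")  # first allocation picks up the config
    print(f"reserved: {torch.cuda.memory_reserved() / 2**20:.0f} MiB")
```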
@@ -58,6 +67,11 @@ class ParakeetTranscriber:
         self.model.eval()
         if device == "cuda":
             self.model = self.model.cuda()
+            # Enable memory efficient attention if available
+            try:
+                self.model.encoder.use_memory_efficient_attention = True
+            except:
+                pass
 
         # Thread pool for blocking transcription calls
         self.executor = ThreadPoolExecutor(max_workers=2)
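Whether the encoder exposes `use_memory_efficient_attention` depends on the NeMo build, which is why the hunk guards the assignment. A slightly more explicit variant of that guard (a sketch, not the committed code) avoids the bare `except`, which would also swallow unrelated errors:

```python
def enable_efficient_attention(model) -> bool:
    """Set the flag only if the encoder actually exposes it; report whether it applied."""
    encoder = getattr(model, "encoder", None)
    if encoder is not None and hasattr(encoder, "use_memory_efficient_attention"):
        encoder.use_memory_efficient_attention = True
        return True
    return False
```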
@@ -119,7 +133,7 @@ class ParakeetTranscriber:
         # Transcribe using NeMo model
         with torch.no_grad():
-            # Convert to tensor
+            # Convert to tensor and keep on GPU to avoid CPU/GPU bouncing
             audio_signal = torch.from_numpy(audio).unsqueeze(0)
             audio_signal_len = torch.tensor([len(audio)])
@@ -127,12 +141,14 @@ class ParakeetTranscriber:
                 audio_signal = audio_signal.cuda()
                 audio_signal_len = audio_signal_len.cuda()
 
-            # Get transcription with timestamps
-            # NeMo returns list of Hypothesis objects when timestamps=True
+            # Get transcription
+            # NeMo returns list of Hypothesis objects
+            # Note: timestamps=True causes significant VRAM usage (~1-2GB extra)
+            # Only enable for final transcriptions, not streaming partials
             transcriptions = self.model.transcribe(
-                audio=[audio_signal.squeeze(0).cpu().numpy()],
+                audio=[audio],  # Pass NumPy array directly (NeMo handles it efficiently)
                 batch_size=1,
-                timestamps=True  # Enable timestamps to get word-level data
+                timestamps=return_timestamps  # Only use timestamps when explicitly requested
             )
 
             # Extract text from Hypothesis object
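How callers might drive the new `return_timestamps` flag (a hypothetical call site; the enclosing method's name and exact signature are not visible in this hunk, and the return shape is inferred from the hunks below, where the timestamped path returns a dict and the plain path returns text):

```python
async def demo(transcriber, chunk_audio, full_audio):
    # Streaming partials: skip timestamps so VRAM usage stays flat.
    partial_text = await transcriber.transcribe(chunk_audio, return_timestamps=False)

    # Final pass: pay the extra VRAM once per utterance to get word-level timing.
    final = await transcriber.transcribe(full_audio, return_timestamps=True)
    return partial_text, final
```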
@@ -144,9 +160,9 @@ class ParakeetTranscriber:
             # Hypothesis object has .text attribute
             text = hypothesis.text.strip() if hasattr(hypothesis, 'text') else str(hypothesis).strip()
 
-            # Extract word-level timestamps if available
+            # Extract word-level timestamps if available and requested
             words = []
-            if hasattr(hypothesis, 'timestamp') and hypothesis.timestamp:
+            if return_timestamps and hasattr(hypothesis, 'timestamp') and hypothesis.timestamp:
                 # timestamp is a dict with 'word' key containing list of word timestamps
                 word_timestamps = hypothesis.timestamp.get('word', [])
                 for word_info in word_timestamps:
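The loop body that builds each word entry is elided by this hunk. A sketch of the normalization it presumably performs (the key names inside each `word_info` dict vary across NeMo versions, so treat them as assumptions):

```python
from typing import Any

def extract_words(word_timestamps: list[dict[str, Any]]) -> list[dict[str, Any]]:
    # Normalize NeMo word-timestamp entries; recent NeMo releases report
    # 'start'/'end' in seconds, older ones expose frame offsets instead.
    return [
        {
            "word": w.get("word", ""),
            "start": float(w.get("start", 0.0)),
            "end": float(w.get("end", 0.0)),
        }
        for w in word_timestamps
    ]
```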
@@ -165,6 +181,10 @@ class ParakeetTranscriber:
                 }
             else:
                 return text
+
+        # Note: We do NOT call torch.cuda.empty_cache() here
+        # That breaks PyTorch's memory allocator and causes fragmentation
+        # Let PyTorch manage its own memory pool
 
     async def transcribe_streaming(
         self,
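On the final note above: the gap between `reserved` and `allocated` memory is PyTorch's reusable cache, not a leak, so observing the allocator is usually more useful than forcing `empty_cache()`, which hands cached blocks back to the driver only to re-request them later. A small monitoring helper along those lines (illustrative, not part of the commit):

```python
import torch

def log_cuda_memory(tag: str) -> None:
    # reserved >= allocated; the difference is the allocator's cached pool.
    if torch.cuda.is_available():
        alloc = torch.cuda.memory_allocated() / 2**20
        reserved = torch.cuda.memory_reserved() / 2**20
        print(f"[{tag}] allocated={alloc:.0f} MiB reserved={reserved:.0f} MiB")
```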