Implemented experimental but production-ready voice chat, relegating the old flow to a voice debug mode. Added a new Web UI panel for Voice Chat.

commit 2934efba22
parent 362108f4b0
Date: 2026-01-20 23:06:17 +02:00
31 changed files with 5408 additions and 357 deletions

@@ -49,6 +49,15 @@ class ParakeetTranscriber:
         logger.info(f"Loading Parakeet model: {model_name} on {device}...")
 
+        # Set PyTorch memory allocator settings for better memory management
+        if device == "cuda":
+            # Enable expandable segments to reduce fragmentation
+            import os
+            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+            # Clear cache before loading model
+            torch.cuda.empty_cache()
+
         # Load model via NeMo from HuggingFace
         self.model = EncDecRNNTBPEModel.from_pretrained(
             model_name=model_name,
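A note on the allocator setting above: `PYTORCH_CUDA_ALLOC_CONF` is read when PyTorch's CUDA caching allocator first initializes, so it only takes effect if set before the process makes its first CUDA allocation. A minimal standalone sketch of the same idea (illustrative, not part of the commit):

```python
import os

# Must be set before the first CUDA allocation in the process; expandable
# segments let the allocator grow segments in place instead of fragmenting.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch

if torch.cuda.is_available():
    _ = torch.zeros(1024, 1024, device="cuda")  # first allocation picks up the config
    print(f"reserved: {torch.cuda.memory_reserved() / 2**20:.0f} MiB")
```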
@@ -58,6 +67,11 @@ class ParakeetTranscriber:
         self.model.eval()
         if device == "cuda":
             self.model = self.model.cuda()
+            # Enable memory efficient attention if available
+            try:
+                self.model.encoder.use_memory_efficient_attention = True
+            except:
+                pass
 
         # Thread pool for blocking transcription calls
         self.executor = ThreadPoolExecutor(max_workers=2)
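Whether the encoder exposes `use_memory_efficient_attention` depends on the NeMo build, which is why the hunk guards the assignment. A slightly more explicit variant of that guard (a sketch, not the committed code) avoids the bare `except`, which would also swallow unrelated errors:

```python
def enable_efficient_attention(model) -> bool:
    """Set the flag only if the encoder actually exposes it; report whether it applied."""
    encoder = getattr(model, "encoder", None)
    if encoder is not None and hasattr(encoder, "use_memory_efficient_attention"):
        encoder.use_memory_efficient_attention = True
        return True
    return False
```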
@@ -119,7 +133,7 @@ class ParakeetTranscriber:
         # Transcribe using NeMo model
         with torch.no_grad():
-            # Convert to tensor
+            # Convert to tensor and keep on GPU to avoid CPU/GPU bouncing
             audio_signal = torch.from_numpy(audio).unsqueeze(0)
             audio_signal_len = torch.tensor([len(audio)])
@@ -127,12 +141,14 @@ class ParakeetTranscriber:
                 audio_signal = audio_signal.cuda()
                 audio_signal_len = audio_signal_len.cuda()
 
-            # Get transcription with timestamps
-            # NeMo returns list of Hypothesis objects when timestamps=True
+            # Get transcription
+            # NeMo returns list of Hypothesis objects
+            # Note: timestamps=True causes significant VRAM usage (~1-2GB extra)
+            # Only enable for final transcriptions, not streaming partials
             transcriptions = self.model.transcribe(
-                audio=[audio_signal.squeeze(0).cpu().numpy()],
+                audio=[audio],  # Pass NumPy array directly (NeMo handles it efficiently)
                 batch_size=1,
-                timestamps=True  # Enable timestamps to get word-level data
+                timestamps=return_timestamps  # Only use timestamps when explicitly requested
             )
 
             # Extract text from Hypothesis object
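How callers might drive the new `return_timestamps` flag (a hypothetical call site; the enclosing method's name and exact signature are not visible in this hunk, and the return shape is inferred from the hunks below, where the timestamped path returns a dict and the plain path returns text):

```python
async def demo(transcriber, chunk_audio, full_audio):
    # Streaming partials: skip timestamps so VRAM usage stays flat.
    partial_text = await transcriber.transcribe(chunk_audio, return_timestamps=False)

    # Final pass: pay the extra VRAM once per utterance to get word-level timing.
    final = await transcriber.transcribe(full_audio, return_timestamps=True)
    return partial_text, final
```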
@@ -144,9 +160,9 @@ class ParakeetTranscriber:
             # Hypothesis object has .text attribute
             text = hypothesis.text.strip() if hasattr(hypothesis, 'text') else str(hypothesis).strip()
 
-            # Extract word-level timestamps if available
+            # Extract word-level timestamps if available and requested
             words = []
-            if hasattr(hypothesis, 'timestamp') and hypothesis.timestamp:
+            if return_timestamps and hasattr(hypothesis, 'timestamp') and hypothesis.timestamp:
                 # timestamp is a dict with 'word' key containing list of word timestamps
                 word_timestamps = hypothesis.timestamp.get('word', [])
                 for word_info in word_timestamps:
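The loop body that builds each word entry is elided by this hunk. A sketch of the normalization it presumably performs (the key names inside each `word_info` dict vary across NeMo versions, so treat them as assumptions):

```python
from typing import Any

def extract_words(word_timestamps: list[dict[str, Any]]) -> list[dict[str, Any]]:
    # Normalize NeMo word-timestamp entries; recent NeMo releases report
    # 'start'/'end' in seconds, older ones expose frame offsets instead.
    return [
        {
            "word": w.get("word", ""),
            "start": float(w.get("start", 0.0)),
            "end": float(w.get("end", 0.0)),
        }
        for w in word_timestamps
    ]
```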
@@ -165,6 +181,10 @@ class ParakeetTranscriber:
                 }
             else:
                 return text
+
+        # Note: We do NOT call torch.cuda.empty_cache() here
+        # That breaks PyTorch's memory allocator and causes fragmentation
+        # Let PyTorch manage its own memory pool
 
     async def transcribe_streaming(
         self,
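On the final note above: the gap between `reserved` and `allocated` memory is PyTorch's reusable cache, not a leak, so observing the allocator is usually more useful than forcing `empty_cache()`, which hands cached blocks back to the driver only to re-request them later. A small monitoring helper along those lines (illustrative, not part of the commit):

```python
import torch

def log_cuda_memory(tag: str) -> None:
    # reserved >= allocated; the difference is the allocator's cached pool.
    if torch.cuda.is_available():
        alloc = torch.cuda.memory_allocated() / 2**20
        reserved = torch.cuda.memory_reserved() / 2**20
        print(f"[{tag}] allocated={alloc:.0f} MiB reserved={reserved:.0f} MiB")
```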