Implemented experimental real-time voice chat (targeting production readiness) and relegated the old flow to voice debug mode. Added a new Web UI panel for Voice Chat.
@@ -51,6 +51,9 @@ class UserSTTSession:
         self.timestamp_ms = 0.0
         self.transcript_buffer = []
         self.last_transcript = ""
+        self.last_partial_duration = 0.0  # Track when we last sent a partial
+        self.last_speech_timestamp = 0.0  # Track last time we detected speech
+        self.speech_timeout_ms = 3000  # Force finalization after 3s of no new speech
 
         logger.info(f"Created STT session for user {user_id}")
 
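The three new fields form a watchdog: `last_speech_timestamp` is stamped whenever VAD reports speech, and `speech_timeout_ms` bounds how long the session waits before forcing finalization. For the comparison to work, `timestamp_ms` must advance with the audio stream. A minimal sketch of the expected bookkeeping (the update site is not shown in this diff; the 512-sample chunk size is an assumption, only the 16 kHz rate comes from the surrounding code):

```python
SAMPLE_RATE = 16000  # matches the rate used throughout the diff

def advance_clock(timestamp_ms: float, chunk_samples: int) -> float:
    """Advance the per-session millisecond clock by one audio chunk."""
    return timestamp_ms + (chunk_samples / SAMPLE_RATE) * 1000.0

t = 0.0
for _ in range(100):             # 100 silent chunks of 512 samples each
    t = advance_clock(t, 512)    # 32 ms per chunk
print(f"{t:.0f} ms elapsed")     # 3200 ms -> past the 3000 ms timeout
```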
@@ -75,6 +78,8 @@ class UserSTTSession:
         event_type = vad_event["event"]
         probability = vad_event["probability"]
 
+        logger.debug(f"VAD event for user {self.user_id}: {event_type} (prob={probability:.3f})")
+
         # Send VAD event to client
         await self.websocket.send_json({
             "type": "vad",
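For orientation, a client consuming this stream would dispatch on the `type` field. The sketch below is hypothetical: only `"type": "vad"` and the `partial` payload appear in this diff, and the endpoint URL and exact `vad` payload fields are assumptions.

```python
import asyncio
import json

import websockets  # assumed client library; any WebSocket client would do


async def listen(url: str) -> None:
    async with websockets.connect(url) as ws:
        async for raw in ws:
            msg = json.loads(raw)
            if msg["type"] == "vad":
                # e.g. drive a mic-activity indicator in the UI
                print("vad:", msg.get("event"), msg.get("probability"))
            elif msg["type"] == "partial":
                # streaming transcript: overwrite the in-progress line
                print("partial:", msg["text"])


asyncio.run(listen("ws://localhost:8000/ws/voice"))  # URL is hypothetical
```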
@@ -88,63 +93,91 @@ class UserSTTSession:
         if event_type == "speech_start":
             self.is_speaking = True
             self.audio_buffer = [audio_np]
-            logger.debug(f"User {self.user_id} started speaking")
+            self.last_partial_duration = 0.0
+            self.last_speech_timestamp = self.timestamp_ms
+            logger.info(f"[STT] User {self.user_id} SPEECH START")
 
         elif event_type == "speaking":
             if self.is_speaking:
                 self.audio_buffer.append(audio_np)
+                self.last_speech_timestamp = self.timestamp_ms  # Update speech timestamp
 
-                # Transcribe partial every ~2 seconds for streaming
+                # Transcribe partial every ~1 second for streaming (reduced from 2s)
                 total_samples = sum(len(chunk) for chunk in self.audio_buffer)
                 duration_s = total_samples / 16000
 
-                if duration_s >= 2.0:
+                # More frequent partials for better responsiveness
+                if duration_s >= 1.0:
+                    logger.debug(f"Triggering partial transcription at {duration_s:.1f}s")
                     await self._transcribe_partial()
+                    # Keep buffer for final transcription, but mark progress
+                    self.last_partial_duration = duration_s
 
         elif event_type == "speech_end":
             self.is_speaking = False
 
+            logger.info(f"[STT] User {self.user_id} SPEECH END (VAD detected) - transcribing final")
+
             # Transcribe final
             await self._transcribe_final()
 
             # Clear buffer
             self.audio_buffer = []
+            self.last_partial_duration = 0.0
             logger.debug(f"User {self.user_id} stopped speaking")
 
         else:
-            # Still accumulate audio if speaking
+            # No VAD event - still accumulate audio if speaking
             if self.is_speaking:
                 self.audio_buffer.append(audio_np)
+
+                # Check for timeout
+                time_since_speech = self.timestamp_ms - self.last_speech_timestamp
+
+                if time_since_speech >= self.speech_timeout_ms:
+                    # Timeout - user probably stopped but VAD didn't detect it
+                    logger.warning(f"[STT] User {self.user_id} SPEECH TIMEOUT after {time_since_speech:.0f}ms - forcing finalization")
+                    self.is_speaking = False
+
+                    # Force final transcription
+                    await self._transcribe_final()
+
+                    # Clear buffer
+                    self.audio_buffer = []
+                    self.last_partial_duration = 0.0
 
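The forced-finalization condition above reduces to a single comparison; restated standalone for clarity (a sketch, with the arithmetic taken directly from the diff):

```python
def should_force_finalize(timestamp_ms: float,
                          last_speech_timestamp: float,
                          speech_timeout_ms: float = 3000.0) -> bool:
    """True once no speech has been detected for speech_timeout_ms."""
    return (timestamp_ms - last_speech_timestamp) >= speech_timeout_ms

assert should_force_finalize(4000.0, 500.0)      # 3500 ms of silence -> finalize
assert not should_force_finalize(2000.0, 500.0)  # 1500 ms -> keep waiting
```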
     async def _transcribe_partial(self):
-        """Transcribe accumulated audio and send partial result with word tokens."""
+        """Transcribe accumulated audio and send partial result (no timestamps to save VRAM)."""
         if not self.audio_buffer:
             return
 
         # Concatenate audio
         audio_full = np.concatenate(self.audio_buffer)
 
-        # Transcribe asynchronously with word-level timestamps
+        # Transcribe asynchronously WITHOUT timestamps for partials (saves 1-2GB VRAM)
         try:
             result = await parakeet_transcriber.transcribe_async(
                 audio_full,
                 sample_rate=16000,
-                return_timestamps=True
+                return_timestamps=False  # Disable timestamps for partials to reduce VRAM usage
             )
 
-            if result and result.get("text") and result["text"] != self.last_transcript:
-                self.last_transcript = result["text"]
+            # Result is just a string when timestamps=False
+            text = result if isinstance(result, str) else result.get("text", "")
+
+            if text and text != self.last_transcript:
+                self.last_transcript = text
 
-                # Send partial transcript with word tokens for LLM pre-computation
+                # Send partial transcript without word tokens (saves memory)
                 await self.websocket.send_json({
                     "type": "partial",
-                    "text": result["text"],
-                    "words": result.get("words", []),  # Word-level tokens
+                    "text": text,
+                    "words": [],  # No word tokens for partials
                     "user_id": self.user_id,
                     "timestamp": self.timestamp_ms
                 })
 
-                logger.info(f"Partial [{self.user_id}]: {result['text']}")
+                logger.info(f"Partial [{self.user_id}]: {text}")
 
         except Exception as e:
             logger.error(f"Partial transcription failed: {e}", exc_info=True)
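The diff implies `parakeet_transcriber.transcribe_async` returns a plain string when `return_timestamps=False` and a dict with `text`/`words` keys when timestamps are on; the new code normalizes both shapes. That normalization, isolated (a sketch inferred from the diff, not the transcriber's documented contract):

```python
from typing import Optional, Union

def extract_text(result: Optional[Union[str, dict]]) -> str:
    """Normalize either transcriber result shape to plain text."""
    if result is None:
        return ""
    if isinstance(result, str):      # return_timestamps=False
        return result
    return result.get("text", "")    # return_timestamps=True

assert extract_text(None) == ""
assert extract_text("hello there") == "hello there"
assert extract_text({"text": "hello", "words": [{"word": "hello"}]}) == "hello"
```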
@@ -220,8 +253,8 @@ async def startup_event():
     vad_processor = VADProcessor(
         sample_rate=16000,
         threshold=0.5,
-        min_speech_duration_ms=250,  # Conservative
-        min_silence_duration_ms=500  # Conservative
+        min_speech_duration_ms=250,  # Conservative - wait 250ms before starting
+        min_silence_duration_ms=300  # Reduced from 500ms - detect silence faster
     )
     logger.info("✓ VAD ready")
 