refactor: Implement low-latency STT pipeline with speculative transcription

Major architectural overhaul of the speech-to-text pipeline for real-time voice chat:

STT Server Rewrite:
- Replaced RealtimeSTT dependency with direct Silero VAD + Faster-Whisper integration
- Achieved sub-second latency by eliminating unnecessary abstractions
- Uses small.en Whisper model for fast transcription (~850ms)

Speculative Transcription (NEW):
- Start transcribing at 150ms silence (speculative) while still listening
- If speech continues, discard speculative result and keep buffering
- If 400ms silence confirmed, use pre-computed speculative result immediately
- Reduces latency by ~250-850ms for typical utterances with clear pauses

VAD Implementation:
- Silero VAD with ONNX (CPU-efficient) for 32ms chunk processing
- Direct speech boundary detection without RealtimeSTT overhead
- Configurable thresholds for silence detection (400ms final, 150ms speculative)

Architecture:
- Single Whisper model loaded once, shared across sessions
- VAD runs on every 512-sample chunk for immediate speech detection
- Background transcription worker thread for non-blocking processing
- Greedy decoding (beam_size=1) for maximum speed

Performance:
- Previous: 400ms silence wait + ~850ms transcription = ~1.25s total latency
- Current: 400ms silence wait + 0ms (speculative ready) = ~400ms (best case)
- Single model reduces VRAM usage, prevents OOM on GTX 1660

Container Manager Updates:
- Updated health check logic to work with new response format
- Changed from checking 'warmed_up' flag to just 'status: ready'
- Improved terminology from 'warmup' to 'models loading'

Files Changed:
- stt-realtime/stt_server.py: Complete rewrite with Silero VAD + speculative transcription
- stt-realtime/requirements.txt: Removed RealtimeSTT, using torch.hub for Silero VAD
- bot/utils/container_manager.py: Updated health check for new STT response format
- bot/api.py: Updated docstring to reflect new architecture
- backups/: Archived old RealtimeSTT-based implementation

This addresses low-latency requirements while maintaining accuracy with configurable speech detection thresholds.
2026-01-22 22:08:07 +02:00
parent 2934efba22
commit eb03dfce4d
5 changed files with 850 additions and 400 deletions
--- a/stt-realtime/stt_server.py
+++ b/stt-realtime/stt_server.py
@@ -1,9 +1,14 @@
 #!/usr/bin/env python3
 """
-RealtimeSTT WebSocket Server
+Low-Latency STT WebSocket Server

-Provides real-time speech-to-text transcription using Faster-Whisper.
-Receives audio chunks via WebSocket and streams back partial/final transcripts.
+Uses Silero VAD for speech detection + Faster-Whisper turbo for transcription.
+Achieves sub-second latency after speech ends.
+
+Architecture:
+1. Silero VAD runs on every audio chunk to detect speech boundaries
+2. When speech ends (silence detected), immediately transcribe the buffer
+3. Send final transcript - no waiting for stability

 Protocol:
 - Client sends: binary audio data (16kHz, 16-bit mono PCM)
@@ -32,352 +37,357 @@ logging.basicConfig(
 )
 logger = logging.getLogger('stt-realtime')

-# Import RealtimeSTT
-from RealtimeSTT import AudioToTextRecorder
+# Silero VAD
+import torch
+torch.set_num_threads(1)  # Prevent thread contention

-# Global warmup state
+# Faster-Whisper for transcription
+from faster_whisper import WhisperModel
+
+# Global model (shared across sessions for memory efficiency)
+whisper_model: Optional[WhisperModel] = None
+vad_model = None
 warmup_complete = False
-warmup_lock = threading.Lock()
-warmup_recorder = None
+
+
+def load_vad_model():
+    """Load Silero VAD model."""
+    global vad_model
+    model, _ = torch.hub.load(
+        repo_or_dir='snakers4/silero-vad',
+        model='silero_vad',
+        force_reload=False,
+        onnx=True  # Use ONNX for speed
+    )
+    vad_model = model
+    logger.info("Silero VAD loaded (ONNX)")
+    return model
+
+
+def load_whisper_model(config: Dict[str, Any]):
+    """Load Faster-Whisper model."""
+    global whisper_model
+    whisper_model = WhisperModel(
+        config['model'],
+        device=config['device'],
+        compute_type=config['compute_type'],
+    )
+    logger.info(f"Faster-Whisper '{config['model']}' loaded on {config['device']}")
+    return whisper_model


 class STTSession:
    """
-    Manages a single STT session for a WebSocket client.
-    Uses RealtimeSTT's AudioToTextRecorder with feed_audio() method.
+    Low-latency STT session using Silero VAD + Faster-Whisper.
    """
    
+    SAMPLE_RATE = 16000
+    VAD_CHUNK_MS = 32  # Silero needs 512 samples at 16kHz = 32ms
+    VAD_CHUNK_SAMPLES = 512  # Fixed: Silero requires exactly 512 samples at 16kHz
+    
    def __init__(self, websocket, session_id: str, config: Dict[str, Any]):
        self.websocket = websocket
        self.session_id = session_id
        self.config = config
-        self.recorder: Optional[AudioToTextRecorder] = None
        self.running = False
-        self.audio_queue = queue.Queue()
-        self.feed_thread: Optional[threading.Thread] = None
-        self.last_partial = ""
-        self.last_stabilized = ""  # Track last stabilized partial
-        self.last_text_was_stabilized = False  # Track which came last
-        self.recording_active = False  # Track if currently recording
+        self.loop = None
        
-        logger.info(f"[{session_id}] Session created")
-    
-    def _on_realtime_transcription(self, text: str):
-        """Called when partial transcription is available."""
-        if text and text != self.last_partial:
-            self.last_partial = text
-            self.last_text_was_stabilized = False  # Partial came after stabilized
-            logger.info(f"[{self.session_id}] 📝 Partial: {text}")
-            asyncio.run_coroutine_threadsafe(
-                self._send_transcript("partial", text),
-                self.loop
-            )
-    
-    def _on_realtime_stabilized(self, text: str):
-        """Called when a stabilized partial is available (high confidence)."""
-        if text and text.strip():
-            self.last_stabilized = text
-            self.last_text_was_stabilized = True  # Stabilized came after partial
-            logger.info(f"[{self.session_id}] 🔒 Stabilized: {text}")
-            asyncio.run_coroutine_threadsafe(
-                self._send_transcript("partial", text),
-                self.loop
-            )
-    
-    def _on_recording_stop(self):
-        """Called when recording stops (silence detected)."""
-        logger.info(f"[{self.session_id}] ⏹️ Recording stopped")
-        self.recording_active = False
+        # Audio state
+        self.audio_buffer = []  # Float32 samples for current utterance
+        self.vad_buffer = []  # Small buffer for VAD chunk alignment
        
-        # Use the most recent text: prioritize whichever came last
-        if self.last_text_was_stabilized:
-            final_text = self.last_stabilized or self.last_partial
-            source = "stabilized" if self.last_stabilized else "partial"
-        else:
-            final_text = self.last_partial or self.last_stabilized
-            source = "partial" if self.last_partial else "stabilized"
+        # Speech detection state
+        self.is_speaking = False
+        self.silence_start_time = 0
+        self.speech_start_time = 0
        
-        if final_text:
-            logger.info(f"[{self.session_id}] ✅ Final (from {source}): {final_text}")
-            asyncio.run_coroutine_threadsafe(
-                self._send_transcript("final", final_text),
-                self.loop
-            )
-        else:
-            # No transcript means VAD false positive (detected "speech" in pure noise)
-            logger.warning(f"[{self.session_id}] ⚠️  Recording stopped but no transcript available (VAD false positive)")
-            logger.info(f"[{self.session_id}] 🔄 Clearing audio buffer to recover")
-            
-            # Clear the audio queue to prevent stale data
-            try:
-                while not self.audio_queue.empty():
-                    self.audio_queue.get_nowait()
-            except Exception:
-                pass
+        # Configurable thresholds
+        self.vad_threshold = config.get('vad_threshold', 0.5)
+        self.silence_duration_ms = config.get('silence_duration_ms', 400)
+        self.min_speech_ms = config.get('min_speech_ms', 250)
+        self.max_speech_duration = config.get('max_speech_duration', 30.0)
        
-        # Reset state
-        self.last_stabilized = ""
-        self.last_partial = ""
-        self.last_text_was_stabilized = False
-    
-    def _on_recording_start(self):
-        """Called when recording starts (speech detected)."""
-        logger.info(f"[{self.session_id}] 🎙️ Recording started")
-        self.recording_active = True
-        self.last_stabilized = ""
-        self.last_partial = ""
-    
-    def _on_transcription(self, text: str):
-        """Not used - we use stabilized partials as finals."""
-        pass
-    
-    async def _send_transcript(self, transcript_type: str, text: str):
-        """Send transcript to client via WebSocket."""
-        try:
-            message = {
-                "type": transcript_type,
-                "text": text,
-                "timestamp": time.time()
-            }
-            await self.websocket.send(json.dumps(message))
-        except Exception as e:
-            logger.error(f"[{self.session_id}] Failed to send transcript: {e}")
-    
-    def _feed_audio_thread(self):
-        """Thread that feeds audio to the recorder."""
-        logger.info(f"[{self.session_id}] Audio feed thread started")
-        while self.running:
-            try:
-                # Get audio chunk with timeout
-                audio_chunk = self.audio_queue.get(timeout=0.1)
-                if audio_chunk is not None and self.recorder:
-                    self.recorder.feed_audio(audio_chunk)
-            except queue.Empty:
-                continue
-            except Exception as e:
-                logger.error(f"[{self.session_id}] Error feeding audio: {e}")
-        logger.info(f"[{self.session_id}] Audio feed thread stopped")
+        # Speculative transcription settings
+        self.speculative_silence_ms = config.get('speculative_silence_ms', 150)  # Start transcribing early
+        self.speculative_pending = False  # Is a speculative transcription in flight?
+        self.speculative_audio_snapshot = None  # Audio buffer snapshot for speculative
+        self.speculative_result = None  # Result from speculative transcription
+        self.speculative_result_ready = threading.Event()
+        
+        # Transcription queue
+        self.transcribe_queue = queue.Queue()
+        self.transcribe_thread = None
+        
+        logger.info(f"[{session_id}] Session created (speculative: {self.speculative_silence_ms}ms, final: {self.silence_duration_ms}ms)")
    
    async def start(self, loop: asyncio.AbstractEventLoop):
-        """Start the STT session."""
+        """Start the session."""
        self.loop = loop
        self.running = True
        
-        logger.info(f"[{self.session_id}] Starting RealtimeSTT recorder...")
-        logger.info(f"[{self.session_id}] Model: {self.config['model']}")
-        logger.info(f"[{self.session_id}] Device: {self.config['device']}")
+        self.transcribe_thread = threading.Thread(target=self._transcription_worker, daemon=True)
+        self.transcribe_thread.start()
        
-        try:
-            # Create recorder in a thread to avoid blocking
-            def init_recorder():
-                self.recorder = AudioToTextRecorder(
-                    # Model settings - using same model for both partial and final
-                    model=self.config['model'],
-                    language=self.config['language'],
-                    compute_type=self.config['compute_type'],
-                    device=self.config['device'],
-                    
-                    # Disable microphone - we feed audio manually
-                    use_microphone=False,
-                    
-                    # Real-time transcription - use same model for everything
-                    enable_realtime_transcription=True,
-                    realtime_model_type=self.config['model'],  # Use same model
-                    realtime_processing_pause=0.05,  # 50ms between updates
-                    on_realtime_transcription_update=self._on_realtime_transcription,
-                    on_realtime_transcription_stabilized=self._on_realtime_stabilized,
-                    
-                    # VAD settings - very permissive, rely on Discord's VAD for speech detection
-                    # Our VAD is only for silence detection, not filtering audio content
-                    silero_sensitivity=0.05,  # Very low = barely filters anything
-                    silero_use_onnx=True,  # Faster
-                    webrtc_sensitivity=3,
-                    post_speech_silence_duration=self.config['silence_duration'],
-                    min_length_of_recording=self.config['min_recording_length'],
-                    min_gap_between_recordings=self.config['min_gap'],
-                    pre_recording_buffer_duration=1.0,  # Capture more audio before/after speech
-                    
-                    # Callbacks
-                    on_recording_start=self._on_recording_start,
-                    on_recording_stop=self._on_recording_stop,
-                    on_vad_detect_start=lambda: logger.debug(f"[{self.session_id}] VAD listening"),
-                    on_vad_detect_stop=lambda: logger.debug(f"[{self.session_id}] VAD stopped"),
-                    
-                    # Other settings
-                    spinner=False,  # No spinner in container
-                    level=logging.WARNING,  # Reduce internal logging
-                    
-                    # Beam search settings
-                    beam_size=5,  # Higher beam = better accuracy (used for final processing)
-                    beam_size_realtime=5,  # Increased from 3 for better real-time accuracy
-                    
-                    # Batch sizes
-                    batch_size=16,
-                    realtime_batch_size=8,
-                    
-                    initial_prompt="",  # Can add context here if needed
+        logger.info(f"[{self.session_id}] Session started")
+    
+    def _transcription_worker(self):
+        """Background thread that processes transcription requests."""
+        while self.running:
+            try:
+                item = self.transcribe_queue.get(timeout=0.1)
+                if item is None:
+                    continue
+                
+                audio_array, is_final, is_speculative = item
+                start_time = time.time()
+                
+                segments, info = whisper_model.transcribe(
+                    audio_array,
+                    language=self.config.get('language', 'en'),
+                    beam_size=1,
+                    best_of=1,
+                    temperature=0.0,
+                    vad_filter=False,
+                    without_timestamps=True,
                )
-                logger.info(f"[{self.session_id}] ✅ Recorder initialized")
-            
-            # Run initialization in thread pool
-            await asyncio.get_event_loop().run_in_executor(None, init_recorder)
-            
-            # Start audio feed thread
-            self.feed_thread = threading.Thread(target=self._feed_audio_thread, daemon=True)
-            self.feed_thread.start()
-            
-            # Start the recorder's text processing loop in a thread
-            def run_text_loop():
-                while self.running:
-                    try:
-                        # This blocks until speech is detected and transcribed
-                        text = self.recorder.text(self._on_transcription)
-                    except Exception as e:
-                        if self.running:
-                            logger.error(f"[{self.session_id}] Text loop error: {e}")
-                        break
-            
-            self.text_thread = threading.Thread(target=run_text_loop, daemon=True)
-            self.text_thread.start()
-            
-            logger.info(f"[{self.session_id}] ✅ Session started successfully")
-            
+                
+                text = " ".join(seg.text for seg in segments).strip()
+                elapsed = time.time() - start_time
+                
+                if is_speculative:
+                    # Store result for potential use
+                    self.speculative_result = (text, elapsed)
+                    self.speculative_result_ready.set()
+                    logger.debug(f"[{self.session_id}] SPECULATIVE ({elapsed:.2f}s): {text}")
+                elif text:
+                    transcript_type = "final" if is_final else "partial"
+                    logger.info(f"[{self.session_id}] {transcript_type.upper()} ({elapsed:.2f}s): {text}")
+                    
+                    asyncio.run_coroutine_threadsafe(
+                        self._send_transcript(transcript_type, text),
+                        self.loop
+                    )
+                
+            except queue.Empty:
+                continue
+            except Exception as e:
+                logger.error(f"[{self.session_id}] Transcription error: {e}", exc_info=True)
+    
+    async def _send_transcript(self, transcript_type: str, text: str):
+        """Send transcript to client."""
+        try:
+            await self.websocket.send(json.dumps({
+                "type": transcript_type,
+                "text": text,
+                "timestamp": time.time()
+            }))
        except Exception as e:
-            logger.error(f"[{self.session_id}] Failed to start session: {e}", exc_info=True)
-            raise
+            logger.error(f"[{self.session_id}] Send error: {e}")
    
    def feed_audio(self, audio_data: bytes):
-        """Feed audio data to the recorder."""
-        if self.running:
-            # Convert bytes to numpy array (16-bit PCM)
-            audio_np = np.frombuffer(audio_data, dtype=np.int16)
-            self.audio_queue.put(audio_np)
+        """Process incoming audio data."""
+        if not self.running:
+            return
+        
+        audio_int16 = np.frombuffer(audio_data, dtype=np.int16)
+        audio_float = audio_int16.astype(np.float32) / 32768.0
+        
+        self.vad_buffer.extend(audio_float)
+        
+        while len(self.vad_buffer) >= self.VAD_CHUNK_SAMPLES:
+            chunk = np.array(self.vad_buffer[:self.VAD_CHUNK_SAMPLES], dtype=np.float32)
+            self.vad_buffer = self.vad_buffer[self.VAD_CHUNK_SAMPLES:]
+            self._process_vad_chunk(chunk)
+    
+    def _process_vad_chunk(self, chunk: np.ndarray):
+        """Process a single VAD chunk."""
+        current_time = time.time()
+        
+        chunk_tensor = torch.from_numpy(chunk)
+        speech_prob = vad_model(chunk_tensor, self.SAMPLE_RATE).item()
+        
+        is_speech = speech_prob >= self.vad_threshold
+        
+        if is_speech:
+            if not self.is_speaking:
+                self.is_speaking = True
+                self.speech_start_time = current_time
+                self.audio_buffer = []
+                logger.debug(f"[{self.session_id}] Speech started")
+            
+            self.audio_buffer.extend(chunk)
+            self.silence_start_time = 0
+            
+            # Cancel any speculative transcription if speech resumed
+            if self.speculative_pending:
+                logger.debug(f"[{self.session_id}] Speech resumed, canceling speculative")
+                self.speculative_pending = False
+                self.speculative_result = None
+                self.speculative_result_ready.clear()
+            
+            speech_duration = current_time - self.speech_start_time
+            if speech_duration >= self.max_speech_duration:
+                logger.info(f"[{self.session_id}] Max duration reached")
+                self._finalize_utterance()
+        
+        else:
+            if self.is_speaking:
+                self.audio_buffer.extend(chunk)
+                
+                if self.silence_start_time == 0:
+                    self.silence_start_time = current_time
+                
+                silence_duration_ms = (current_time - self.silence_start_time) * 1000
+                speech_duration_ms = (self.silence_start_time - self.speech_start_time) * 1000
+                
+                # Trigger speculative transcription early
+                if (not self.speculative_pending and 
+                    silence_duration_ms >= self.speculative_silence_ms and
+                    speech_duration_ms >= self.min_speech_ms):
+                    self._start_speculative_transcription()
+                
+                # Final silence threshold reached
+                if silence_duration_ms >= self.silence_duration_ms:
+                    if speech_duration_ms >= self.min_speech_ms:
+                        logger.debug(f"[{self.session_id}] Speech ended ({speech_duration_ms:.0f}ms)")
+                        self._finalize_utterance()
+                    else:
+                        logger.debug(f"[{self.session_id}] Discarding short utterance")
+                        self._reset_state()
+    
+    def _start_speculative_transcription(self):
+        """Start speculative transcription without waiting for full silence."""
+        if self.audio_buffer:
+            self.speculative_pending = True
+            self.speculative_result = None
+            self.speculative_result_ready.clear()
+            
+            # Snapshot current buffer
+            audio_array = np.array(self.audio_buffer, dtype=np.float32)
+            duration = len(audio_array) / self.SAMPLE_RATE
+            
+            logger.debug(f"[{self.session_id}] Starting speculative transcription ({duration:.1f}s)")
+            # is_speculative=True
+            self.transcribe_queue.put((audio_array, False, True))
+    
+    def _finalize_utterance(self):
+        """Finalize current utterance and send transcript."""
+        if not self.audio_buffer:
+            self._reset_state()
+            return
+        
+        audio_array = np.array(self.audio_buffer, dtype=np.float32)
+        duration = len(audio_array) / self.SAMPLE_RATE
+        
+        # Check if we have a speculative result ready
+        if self.speculative_pending and self.speculative_result_ready.wait(timeout=0.05):
+            # Use speculative result immediately!
+            text, elapsed = self.speculative_result
+            if text:
+                logger.info(f"[{self.session_id}] FINAL [speculative] ({elapsed:.2f}s): {text}")
+                asyncio.run_coroutine_threadsafe(
+                    self._send_transcript("final", text),
+                    self.loop
+                )
+            self._reset_state()
+            return
+        
+        # No speculative result, do regular transcription
+        logger.info(f"[{self.session_id}] Queuing transcription ({duration:.1f}s)")
+        self.transcribe_queue.put((audio_array, True, False))
+        
+        self._reset_state()
+    
+    def _reset_state(self):
+        """Reset speech detection state."""
+        self.is_speaking = False
+        self.audio_buffer = []
+        self.silence_start_time = 0
+        self.speech_start_time = 0
+        self.speculative_pending = False
+        self.speculative_result = None
+        self.speculative_result_ready.clear()
    
    def reset(self):
-        """Reset the session state."""
-        logger.info(f"[{self.session_id}] Resetting session")
-        self.last_partial = ""
-        # Clear audio queue
-        while not self.audio_queue.empty():
-            try:
-                self.audio_queue.get_nowait()
-            except queue.Empty:
-                break
+        """Reset session state."""
+        logger.info(f"[{self.session_id}] Resetting")
+        self._reset_state()
+        self.vad_buffer = []
    
    async def stop(self):
-        """Stop the session and cleanup."""
-        logger.info(f"[{self.session_id}] Stopping session...")
+        """Stop the session."""
+        logger.info(f"[{self.session_id}] Stopping...")
        self.running = False
        
-        # Wait for threads to finish
-        if self.feed_thread and self.feed_thread.is_alive():
-            self.feed_thread.join(timeout=2)
+        if self.audio_buffer and self.is_speaking:
+            self._finalize_utterance()
        
-        # Shutdown recorder
-        if self.recorder:
-            try:
-                self.recorder.shutdown()
-            except Exception as e:
-                logger.error(f"[{self.session_id}] Error shutting down recorder: {e}")
+        if self.transcribe_thread and self.transcribe_thread.is_alive():
+            self.transcribe_thread.join(timeout=2)
        
-        logger.info(f"[{self.session_id}] Session stopped")
+        logger.info(f"[{self.session_id}] Stopped")


 class STTServer:
-    """
-    WebSocket server for RealtimeSTT.
-    Handles multiple concurrent clients (one per Discord user).
-    """
+    """WebSocket server for low-latency STT."""
    
-    def __init__(self, host: str = "0.0.0.0", port: int = 8766):
+    def __init__(self, host: str, port: int, config: Dict[str, Any]):
        self.host = host
        self.port = port
+        self.config = config
        self.sessions: Dict[str, STTSession] = {}
        self.session_counter = 0
        
-        # Default configuration
-        self.config = {
-            # Model - using small.en (English-only, more accurate than multilingual small)
-            'model': 'small.en',
-            'language': 'en',
-            'compute_type': 'float16',  # FP16 for GPU efficiency
-            'device': 'cuda',
-            
-            # VAD settings
-            'silero_sensitivity': 0.6,
-            'webrtc_sensitivity': 3,
-            'silence_duration': 0.8,  # Shorter to improve responsiveness
-            'min_recording_length': 0.5,
-            'min_gap': 0.3,
-        }
-        
        logger.info("=" * 60)
-        logger.info("RealtimeSTT Server Configuration:")
+        logger.info("Low-Latency STT Server")
        logger.info(f"  Host: {host}:{port}")
-        logger.info(f"  Model: {self.config['model']} (English-only, optimized)")
-        logger.info(f"  Beam size: 5 (higher accuracy)")
-        logger.info(f"  Strategy: Use last partial as final (instant response)")
-        logger.info(f"  Language: {self.config['language']}")
-        logger.info(f"  Device: {self.config['device']}")
-        logger.info(f"  Compute Type: {self.config['compute_type']}")
-        logger.info(f"  Silence Duration: {self.config['silence_duration']}s")
+        logger.info(f"  Model: {config['model']}")
+        logger.info(f"  Language: {config.get('language', 'en')}")
+        logger.info(f"  Silence: {config.get('silence_duration_ms', 400)}ms")
        logger.info("=" * 60)
    
    async def handle_client(self, websocket):
-        """Handle a WebSocket client connection."""
+        """Handle WebSocket client."""
        self.session_counter += 1
        session_id = f"session_{self.session_counter}"
        session = None
        
        try:
-            logger.info(f"[{session_id}] Client connected from {websocket.remote_address}")
+            logger.info(f"[{session_id}] Client connected")
            
-            # Create session
            session = STTSession(websocket, session_id, self.config)
            self.sessions[session_id] = session
-            
-            # Start session
            await session.start(asyncio.get_event_loop())
            
-            # Process messages
            async for message in websocket:
-                try:
-                    if isinstance(message, bytes):
-                        # Binary audio data
-                        session.feed_audio(message)
-                    else:
-                        # JSON command
+                if isinstance(message, bytes):
+                    session.feed_audio(message)
+                else:
+                    try:
                        data = json.loads(message)
-                        command = data.get('command', '')
-                        
-                        if command == 'reset':
+                        cmd = data.get('command', '')
+                        if cmd == 'reset':
                            session.reset()
-                        elif command == 'ping':
+                        elif cmd == 'ping':
                            await websocket.send(json.dumps({
                                'type': 'pong',
                                'timestamp': time.time()
                            }))
-                        else:
-                            logger.warning(f"[{session_id}] Unknown command: {command}")
-                            
-                except json.JSONDecodeError:
-                    logger.warning(f"[{session_id}] Invalid JSON message")
-                except Exception as e:
-                    logger.error(f"[{session_id}] Error processing message: {e}")
+                    except json.JSONDecodeError:
+                        pass
        
        except websockets.exceptions.ConnectionClosed:
            logger.info(f"[{session_id}] Client disconnected")
        except Exception as e:
            logger.error(f"[{session_id}] Error: {e}", exc_info=True)
        finally:
-            # Cleanup
            if session:
                await session.stop()
                del self.sessions[session_id]
    
    async def run(self):
-        """Run the WebSocket server."""
-        logger.info(f"Starting RealtimeSTT server on ws://{self.host}:{self.port}")
+        """Run the server."""
+        logger.info(f"Starting server on ws://{self.host}:{self.port}")
        
        async with serve(
            self.handle_client,
@@ -385,137 +395,83 @@ class STTServer:
            self.port,
            ping_interval=30,
            ping_timeout=10,
-            max_size=10 * 1024 * 1024,  # 10MB max message size
+            max_size=10 * 1024 * 1024,
        ):
-            logger.info("✅ Server ready and listening for connections")
-            await asyncio.Future()  # Run forever
+            logger.info("Server ready")
+            await asyncio.Future()


-async def warmup_model(config: Dict[str, Any]):
-    """
-    Warm up the STT model by loading it and processing test audio.
-    This ensures the model is cached in memory before handling real requests.
-    """
-    global warmup_complete, warmup_recorder
+async def warmup(config: Dict[str, Any]):
+    """Load the VAD and Whisper models, run one throwaway transcription, then flag readiness."""
+    global warmup_complete
     
-    with warmup_lock:
-        if warmup_complete:
-            logger.info("Model already warmed up")
-            return
-        
-        logger.info("🔥 Starting model warmup...")
-        try:
-            # Generate silent test audio (1 second of silence, 16kHz)
-            test_audio = np.zeros(16000, dtype=np.int16)
-            
-            # Initialize a temporary recorder to load the model
-            logger.info("Loading Faster-Whisper model...")
-            
-            def dummy_callback(text):
-                pass
-            
-            # This will trigger model loading and compilation
-            warmup_recorder = AudioToTextRecorder(
-                model=config['model'],
-                language=config['language'],
-                compute_type=config['compute_type'],
-                device=config['device'],
-                silero_sensitivity=config['silero_sensitivity'],
-                webrtc_sensitivity=config['webrtc_sensitivity'],
-                post_speech_silence_duration=config['silence_duration'],
-                min_length_of_recording=config['min_recording_length'],
-                min_gap_between_recordings=config['min_gap'],
-                enable_realtime_transcription=True,
-                realtime_processing_pause=0.1,
-                on_realtime_transcription_update=dummy_callback,
-                on_realtime_transcription_stabilized=dummy_callback,
-                spinner=False,
-                level=logging.WARNING,
-                beam_size=5,
-                beam_size_realtime=5,
-                batch_size=16,
-                realtime_batch_size=8,
-                initial_prompt="",
-            )
-            
-            logger.info("✅ Model loaded and warmed up successfully")
-            warmup_complete = True
-            
-        except Exception as e:
-            logger.error(f"❌ Warmup failed: {e}", exc_info=True)
-            warmup_complete = False
+    logger.info("Loading models...")
+    load_vad_model()
+    load_whisper_model(config)
+    
+    logger.info("Warming up transcription...")
+    silence = np.zeros(16000, dtype=np.float32)
+    warmup_language = config.get('language', 'en')
+    warmup_segments, _info = whisper_model.transcribe(
+        silence, language=warmup_language, beam_size=1
+    )
+    # Iterate to exhaustion so the transcription actually runs (presumably lazy, as in faster-whisper).
+    for _segment in warmup_segments:
+        pass
+    
+    warmup_complete = True
+    logger.info("Warmup complete")


 async def health_handler(request):
-    """HTTP health check endpoint"""
+    """Return 200 {"status": "ready"} once warmup is done, else 503 {"status": "warming_up"}."""
-    if warmup_complete:
-        return web.json_response({
-            "status": "ready",
-            "warmed_up": True,
-            "model": "small.en",
-            "device": "cuda"
-        })
-    else:
-        return web.json_response({
-            "status": "warming_up",
-            "warmed_up": False,
-            "model": "small.en",
-            "device": "cuda"
-        }, status=503)
+    if not warmup_complete:
+        return web.json_response({"status": "warming_up"}, status=503)
+    return web.json_response({"status": "ready"})


-async def start_http_server(host: str, http_port: int):
-    """Start HTTP server for health checks"""
+async def start_http_server(host: str, port: int):
+    """Bind an aiohttp site exposing GET /health on host:port and return once it is started."""
-    app = web.Application()
-    app.router.add_get('/health', health_handler)
-    
-    runner = web.AppRunner(app)
-    await runner.setup()
-    site = web.TCPSite(runner, host, http_port)
+    health_app = web.Application()
+    health_app.router.add_get('/health', health_handler)
+    app_runner = web.AppRunner(health_app)
+    await app_runner.setup()
+    listener = web.TCPSite(app_runner, host, port)
-    await site.start()
-    
-    logger.info(f"✅ HTTP health server listening on http://{host}:{http_port}")
+    await listener.start()
+    logger.info(f"Health server on http://{host}:{port}")


 def main():
-    """Main entry point."""
+    """Configure from STT_* env vars, warm up models, then run the health and WebSocket servers."""
     import os
     
-    # Get configuration from environment
     host = os.environ.get('STT_HOST', '0.0.0.0')
     port = int(os.environ.get('STT_PORT', '8766'))
-    http_port = int(os.environ.get('STT_HTTP_PORT', '8767'))  # HTTP health check port
+    http_port = int(os.environ.get('STT_HTTP_PORT', '8767'))
     
-    # Configuration
     config = {
         'model': 'small.en',
         'language': 'en',
         'compute_type': 'float16',
         'device': 'cuda',
-        'silero_sensitivity': 0.6,
-        'webrtc_sensitivity': 3,
-        'silence_duration': 0.8,
-        'min_recording_length': 0.5,
-        'min_gap': 0.3,
+        'vad_threshold': 0.5,
+        'silence_duration_ms': 400,  # Final silence threshold
+        'speculative_silence_ms': 150,  # Start transcribing early at 150ms
+        'min_speech_ms': 250,
+        'max_speech_duration': 30.0,
     }
     
-    # Create and run server
-    server = STTServer(host=host, port=port)
+    server = STTServer(host, port, config)
     
     async def run_all():
-        # Start warmup in background
-        asyncio.create_task(warmup_model(config))
-        
-        # Start HTTP health server
+        await warmup(config)
-        asyncio.create_task(start_http_server(host, http_port))
+        await start_http_server(host, http_port)  # awaited: no orphaned task, bind errors surface here
-        
-        # Start WebSocket server
         await server.run()
     
     try:
         asyncio.run(run_all())
     except KeyboardInterrupt:
-        logger.info("Server shutdown requested")
+        logger.info("Shutdown requested")
     except Exception as e:
         logger.error(f"Server error: {e}", exc_info=True)
         raise