Decided on Parakeet ONNX Runtime. Works pretty great. Realtime voice chat possible now. UX lacking.

2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions
--- a/stt-parakeet/server/display_server.py
+++ b/stt-parakeet/server/display_server.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+"""
+ASR WebSocket Server with Live Transcription Display
+
+This version displays transcriptions in real-time on the server console
+while clients stream audio from remote machines.
+"""
+import asyncio
+import websockets
+import numpy as np
+import json
+import logging
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# Add project root to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from asr.asr_pipeline import ASRPipeline
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('display_server.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+class DisplayServer:
+    """
+    WebSocket server with live transcription display.
+    """
+    
+    def __init__(
+        self,
+        host: str = "0.0.0.0",
+        port: int = 8766,
+        model_path: str = "models/parakeet",
+        sample_rate: int = 16000,
+    ):
+        """
+        Initialize server.
+        
+        Args:
+            host: Host address to bind to
+            port: Port to bind to
+            model_path: Directory containing model files
+            sample_rate: Audio sample rate
+        """
+        self.host = host
+        self.port = port
+        self.sample_rate = sample_rate
+        self.active_connections = set()
+        
+        # Terminal control codes
+        self.CLEAR_LINE = '\033[2K'
+        self.CURSOR_UP = '\033[1A'
+        self.BOLD = '\033[1m'
+        self.GREEN = '\033[92m'
+        self.YELLOW = '\033[93m'
+        self.BLUE = '\033[94m'
+        self.RESET = '\033[0m'
+        
+        # Initialize ASR pipeline
+        logger.info("Loading ASR model...")
+        self.pipeline = ASRPipeline(model_path=model_path)
+        logger.info("ASR Pipeline ready")
+        
+        # Client sessions
+        self.sessions = {}
+    
+    def print_header(self):
+        """Print server header."""
+        print("\n" + "=" * 80)
+        print(f"{self.BOLD}{self.BLUE}ASR Server - Live Transcription Display{self.RESET}")
+        print("=" * 80)
+        print(f"Server: ws://{self.host}:{self.port}")
+        print(f"Sample Rate: {self.sample_rate} Hz")
+        print(f"Model: Parakeet TDT 0.6B V3")
+        print("=" * 80 + "\n")
+    
+    def display_transcription(self, client_id: str, text: str, is_final: bool, is_progressive: bool = False):
+        """
+        Display transcription in the terminal.
+        
+        Args:
+            client_id: Client identifier
+            text: Transcribed text
+            is_final: Whether this is the final transcription
+            is_progressive: Whether this is a progressive update
+        """
+        timestamp = datetime.now().strftime("%H:%M:%S")
+        
+        if is_final:
+            # Final transcription - bold green
+            print(f"{self.GREEN}{self.BOLD}[{timestamp}] {client_id}{self.RESET}")
+            print(f"{self.GREEN}  ✓ FINAL: {text}{self.RESET}\n")
+        elif is_progressive:
+            # Progressive update - yellow
+            print(f"{self.YELLOW}[{timestamp}] {client_id}{self.RESET}")
+            print(f"{self.YELLOW}  → {text}{self.RESET}\n")
+        else:
+            # Regular transcription
+            print(f"{self.BLUE}[{timestamp}] {client_id}{self.RESET}")
+            print(f"  {text}\n")
+        
+        # Flush to ensure immediate display
+        sys.stdout.flush()
+    
+    async def handle_client(self, websocket):
+        """
+        Handle individual WebSocket client connection.
+        
+        Args:
+            websocket: WebSocket connection
+        """
+        client_id = f"{websocket.remote_address[0]}:{websocket.remote_address[1]}"
+        logger.info(f"Client connected: {client_id}")
+        self.active_connections.add(websocket)
+        
+        # Display connection
+        print(f"\n{self.BOLD}{'='*80}{self.RESET}")
+        print(f"{self.GREEN}✓ Client connected: {client_id}{self.RESET}")
+        print(f"{self.BOLD}{'='*80}{self.RESET}\n")
+        sys.stdout.flush()
+        
+        # Audio buffer for accumulating ALL audio
+        all_audio = []
+        last_transcribed_samples = 0
+        
+        # For progressive transcription
+        min_chunk_duration = 2.0  # Minimum 2 seconds before transcribing
+        min_chunk_samples = int(self.sample_rate * min_chunk_duration)
+        
+        try:
+            # Send welcome message
+            await websocket.send(json.dumps({
+                "type": "info",
+                "message": "Connected to ASR server with live display",
+                "sample_rate": self.sample_rate,
+            }))
+            
+            async for message in websocket:
+                try:
+                    if isinstance(message, bytes):
+                        # Binary audio data
+                        audio_data = np.frombuffer(message, dtype=np.int16)
+                        audio_data = audio_data.astype(np.float32) / 32768.0
+                        
+                        # Accumulate all audio
+                        all_audio.append(audio_data)
+                        total_samples = sum(len(chunk) for chunk in all_audio)
+                        
+                        # Transcribe periodically when we have enough NEW audio
+                        samples_since_last = total_samples - last_transcribed_samples
+                        if samples_since_last >= min_chunk_samples:
+                            audio_chunk = np.concatenate(all_audio)
+                            last_transcribed_samples = total_samples
+                            
+                            # Transcribe the accumulated audio
+                            try:
+                                text = self.pipeline.transcribe(
+                                    audio_chunk,
+                                    sample_rate=self.sample_rate
+                                )
+                                
+                                if text and text.strip():
+                                    # Display on server
+                                    self.display_transcription(client_id, text, is_final=False, is_progressive=True)
+                                    
+                                    # Send to client
+                                    response = {
+                                        "type": "transcript",
+                                        "text": text,
+                                        "is_final": False,
+                                    }
+                                    await websocket.send(json.dumps(response))
+                            except Exception as e:
+                                logger.error(f"Transcription error: {e}")
+                                await websocket.send(json.dumps({
+                                    "type": "error",
+                                    "message": f"Transcription failed: {str(e)}"
+                                }))
+                    
+                    elif isinstance(message, str):
+                        # JSON command
+                        try:
+                            command = json.loads(message)
+                            
+                            if command.get("type") == "final":
+                                # Process all accumulated audio (final transcription)
+                                if all_audio:
+                                    audio_chunk = np.concatenate(all_audio)
+                                    
+                                    text = self.pipeline.transcribe(
+                                        audio_chunk,
+                                        sample_rate=self.sample_rate
+                                    )
+                                    
+                                    if text and text.strip():
+                                        # Display on server
+                                        self.display_transcription(client_id, text, is_final=True)
+                                        
+                                        # Send to client
+                                        response = {
+                                            "type": "transcript",
+                                            "text": text,
+                                            "is_final": True,
+                                        }
+                                        await websocket.send(json.dumps(response))
+                                    
+                                    # Clear buffer after final transcription
+                                    all_audio = []
+                                    last_transcribed_samples = 0
+                            
+                            elif command.get("type") == "reset":
+                                # Reset buffer
+                                all_audio = []
+                                last_transcribed_samples = 0
+                                await websocket.send(json.dumps({
+                                    "type": "info",
+                                    "message": "Buffer reset"
+                                }))
+                                print(f"{self.YELLOW}[{client_id}] Buffer reset{self.RESET}\n")
+                                sys.stdout.flush()
+                        
+                        except json.JSONDecodeError:
+                            logger.warning(f"Invalid JSON from {client_id}: {message}")
+                
+                except Exception as e:
+                    logger.error(f"Error processing message from {client_id}: {e}")
+                    break
+        
+        except websockets.exceptions.ConnectionClosed:
+            logger.info(f"Connection closed: {client_id}")
+        except Exception as e:
+            logger.error(f"Unexpected error with {client_id}: {e}")
+        finally:
+            self.active_connections.discard(websocket)
+            print(f"\n{self.BOLD}{'='*80}{self.RESET}")
+            print(f"{self.YELLOW}✗ Client disconnected: {client_id}{self.RESET}")
+            print(f"{self.BOLD}{'='*80}{self.RESET}\n")
+            sys.stdout.flush()
+            logger.info(f"Connection closed: {client_id}")
+    
+    async def start(self):
+        """Start the WebSocket server."""
+        self.print_header()
+        
+        async with websockets.serve(self.handle_client, self.host, self.port):
+            logger.info(f"Starting WebSocket server on {self.host}:{self.port}")
+            print(f"{self.GREEN}{self.BOLD}Server is running and ready for connections!{self.RESET}")
+            print(f"{self.BOLD}Waiting for clients...{self.RESET}\n")
+            sys.stdout.flush()
+            
+            # Keep server running
+            await asyncio.Future()
+
+
+def main():
+    """Main entry point."""
+    import argparse
+    
+    parser = argparse.ArgumentParser(description="ASR Server with Live Display")
+    parser.add_argument("--host", default="0.0.0.0", help="Host address")
+    parser.add_argument("--port", type=int, default=8766, help="Port number")
+    parser.add_argument("--model-path", default="models/parakeet", help="Model directory")
+    parser.add_argument("--sample-rate", type=int, default=16000, help="Sample rate")
+    
+    args = parser.parse_args()
+    
+    server = DisplayServer(
+        host=args.host,
+        port=args.port,
+        model_path=args.model_path,
+        sample_rate=args.sample_rate,
+    )
+    
+    try:
+        asyncio.run(server.start())
+    except KeyboardInterrupt:
+        print(f"\n\n{server.YELLOW}Server stopped by user{server.RESET}")
+        logger.info("Server stopped by user")
+
+
+if __name__ == "__main__":
+    main()