Implemented experimental real-time voice chat (targeting production readiness) and relegated the old flow to voice debug mode. Added a new Web UI panel for Voice Chat.
@@ -51,6 +51,9 @@ class UserSTTSession:
         self.timestamp_ms = 0.0
         self.transcript_buffer = []
         self.last_transcript = ""
+        self.last_partial_duration = 0.0  # Track when we last sent a partial
+        self.last_speech_timestamp = 0.0  # Track last time we detected speech
+        self.speech_timeout_ms = 3000  # Force finalization after 3s of no new speech
 
         logger.info(f"Created STT session for user {user_id}")
 
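The three new fields form a watchdog: `last_speech_timestamp` is stamped whenever VAD reports speech, and `speech_timeout_ms` bounds how long the session waits before forcing finalization. For the comparison to work, `timestamp_ms` must advance with the audio stream. A minimal sketch of the expected bookkeeping (the update site is not shown in this diff; the 512-sample chunk size is an assumption, only the 16 kHz rate comes from the surrounding code):

```python
SAMPLE_RATE = 16000  # matches the rate used throughout the diff

def advance_clock(timestamp_ms: float, chunk_samples: int) -> float:
    """Advance the per-session millisecond clock by one audio chunk."""
    return timestamp_ms + (chunk_samples / SAMPLE_RATE) * 1000.0

t = 0.0
for _ in range(100):             # 100 silent chunks of 512 samples each
    t = advance_clock(t, 512)    # 32 ms per chunk
print(f"{t:.0f} ms elapsed")     # 3200 ms -> past the 3000 ms timeout
```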
@@ -75,6 +78,8 @@ class UserSTTSession:
         event_type = vad_event["event"]
         probability = vad_event["probability"]
 
+        logger.debug(f"VAD event for user {self.user_id}: {event_type} (prob={probability:.3f})")
+
         # Send VAD event to client
         await self.websocket.send_json({
             "type": "vad",
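For orientation, a client consuming this stream would dispatch on the `type` field. The sketch below is hypothetical: only `"type": "vad"` and the `partial` payload appear in this diff, and the endpoint URL and exact `vad` payload fields are assumptions.

```python
import asyncio
import json

import websockets  # assumed client library; any WebSocket client would do


async def listen(url: str) -> None:
    async with websockets.connect(url) as ws:
        async for raw in ws:
            msg = json.loads(raw)
            if msg["type"] == "vad":
                # e.g. drive a mic-activity indicator in the UI
                print("vad:", msg.get("event"), msg.get("probability"))
            elif msg["type"] == "partial":
                # streaming transcript: overwrite the in-progress line
                print("partial:", msg["text"])


asyncio.run(listen("ws://localhost:8000/ws/voice"))  # URL is hypothetical
```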
@@ -88,63 +93,91 @@ class UserSTTSession:
         if event_type == "speech_start":
             self.is_speaking = True
             self.audio_buffer = [audio_np]
-            logger.debug(f"User {self.user_id} started speaking")
+            self.last_partial_duration = 0.0
+            self.last_speech_timestamp = self.timestamp_ms
+            logger.info(f"[STT] User {self.user_id} SPEECH START")
 
         elif event_type == "speaking":
             if self.is_speaking:
                 self.audio_buffer.append(audio_np)
+                self.last_speech_timestamp = self.timestamp_ms  # Update speech timestamp
 
-                # Transcribe partial every ~2 seconds for streaming
+                # Transcribe partial every ~1 second for streaming (reduced from 2s)
                 total_samples = sum(len(chunk) for chunk in self.audio_buffer)
                 duration_s = total_samples / 16000
 
-                if duration_s >= 2.0:
+                # More frequent partials for better responsiveness
+                if duration_s >= 1.0:
+                    logger.debug(f"Triggering partial transcription at {duration_s:.1f}s")
                     await self._transcribe_partial()
+                    # Keep buffer for final transcription, but mark progress
+                    self.last_partial_duration = duration_s
 
         elif event_type == "speech_end":
             self.is_speaking = False
 
+            logger.info(f"[STT] User {self.user_id} SPEECH END (VAD detected) - transcribing final")
+
             # Transcribe final
             await self._transcribe_final()
 
             # Clear buffer
             self.audio_buffer = []
+            self.last_partial_duration = 0.0
             logger.debug(f"User {self.user_id} stopped speaking")
 
         else:
-            # Still accumulate audio if speaking
+            # No VAD event - still accumulate audio if speaking
             if self.is_speaking:
                 self.audio_buffer.append(audio_np)
+
+                # Check for timeout
+                time_since_speech = self.timestamp_ms - self.last_speech_timestamp
+
+                if time_since_speech >= self.speech_timeout_ms:
+                    # Timeout - user probably stopped but VAD didn't detect it
+                    logger.warning(f"[STT] User {self.user_id} SPEECH TIMEOUT after {time_since_speech:.0f}ms - forcing finalization")
+                    self.is_speaking = False
+
+                    # Force final transcription
+                    await self._transcribe_final()
+
+                    # Clear buffer
+                    self.audio_buffer = []
+                    self.last_partial_duration = 0.0
 
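The forced-finalization condition above reduces to a single comparison; restated standalone for clarity (a sketch, with the arithmetic taken directly from the diff):

```python
def should_force_finalize(timestamp_ms: float,
                          last_speech_timestamp: float,
                          speech_timeout_ms: float = 3000.0) -> bool:
    """True once no speech has been detected for speech_timeout_ms."""
    return (timestamp_ms - last_speech_timestamp) >= speech_timeout_ms

assert should_force_finalize(4000.0, 500.0)      # 3500 ms of silence -> finalize
assert not should_force_finalize(2000.0, 500.0)  # 1500 ms -> keep waiting
```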
     async def _transcribe_partial(self):
-        """Transcribe accumulated audio and send partial result with word tokens."""
+        """Transcribe accumulated audio and send partial result (no timestamps to save VRAM)."""
         if not self.audio_buffer:
             return
 
         # Concatenate audio
         audio_full = np.concatenate(self.audio_buffer)
 
-        # Transcribe asynchronously with word-level timestamps
+        # Transcribe asynchronously WITHOUT timestamps for partials (saves 1-2GB VRAM)
         try:
             result = await parakeet_transcriber.transcribe_async(
                 audio_full,
                 sample_rate=16000,
-                return_timestamps=True
+                return_timestamps=False  # Disable timestamps for partials to reduce VRAM usage
             )
 
-            if result and result.get("text") and result["text"] != self.last_transcript:
-                self.last_transcript = result["text"]
+            # Result is just a string when timestamps=False
+            text = result if isinstance(result, str) else result.get("text", "")
+
+            if text and text != self.last_transcript:
+                self.last_transcript = text
 
-                # Send partial transcript with word tokens for LLM pre-computation
+                # Send partial transcript without word tokens (saves memory)
                 await self.websocket.send_json({
                     "type": "partial",
-                    "text": result["text"],
-                    "words": result.get("words", []),  # Word-level tokens
+                    "text": text,
+                    "words": [],  # No word tokens for partials
                     "user_id": self.user_id,
                     "timestamp": self.timestamp_ms
                 })
 
-                logger.info(f"Partial [{self.user_id}]: {result['text']}")
+                logger.info(f"Partial [{self.user_id}]: {text}")
 
         except Exception as e:
             logger.error(f"Partial transcription failed: {e}", exc_info=True)
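The diff implies `parakeet_transcriber.transcribe_async` returns a plain string when `return_timestamps=False` and a dict with `text`/`words` keys when timestamps are on; the new code normalizes both shapes. That normalization, isolated (a sketch inferred from the diff, not the transcriber's documented contract):

```python
from typing import Optional, Union

def extract_text(result: Optional[Union[str, dict]]) -> str:
    """Normalize either transcriber result shape to plain text."""
    if result is None:
        return ""
    if isinstance(result, str):      # return_timestamps=False
        return result
    return result.get("text", "")    # return_timestamps=True

assert extract_text(None) == ""
assert extract_text("hello there") == "hello there"
assert extract_text({"text": "hello", "words": [{"word": "hello"}]}) == "hello"
```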
@@ -220,8 +253,8 @@ async def startup_event():
     vad_processor = VADProcessor(
         sample_rate=16000,
         threshold=0.5,
-        min_speech_duration_ms=250,  # Conservative
-        min_silence_duration_ms=500  # Conservative
+        min_speech_duration_ms=250,  # Conservative - wait 250ms before starting
+        min_silence_duration_ms=300  # Reduced from 500ms - detect silence faster
     )
     logger.info("✓ VAD ready")
 