# miku-discord/stt-parakeet/client/mic_stream.py

"""
Microphone streaming client for ASR WebSocket server
"""
import asyncio
import websockets
import sounddevice as sd
import numpy as np
import json
import logging
import queue
from typing import Optional

# Log line layout: timestamp, logger name, severity, then the message text.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# Configure the root logger once at import time (INFO and above).
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)


class MicrophoneStreamClient:
    """
    Client for streaming microphone audio to ASR WebSocket server.

    A sounddevice input stream delivers float32 audio blocks to
    ``audio_callback``, which converts channel 0 to 16-bit PCM bytes and puts
    them on a ``queue.Queue``. While recording, ``send_audio`` forwards those
    chunks over the WebSocket and ``receive_transcripts`` logs the JSON
    messages the server sends back.
    """

    # Seconds to keep reading server messages after the "final" command.
    FINAL_RESPONSE_TIMEOUT = 0.5

    def __init__(
        self,
        server_url: str = "ws://localhost:8766",
        sample_rate: int = 16000,
        channels: int = 1,
        chunk_duration: float = 0.1,  # seconds
        device: Optional[int] = None,
    ):
        """
        Initialize microphone streaming client.

        Args:
            server_url: WebSocket server URL
            sample_rate: Audio sample rate (16000 Hz recommended)
            channels: Number of audio channels (1 for mono); only channel 0
                is ever sent to the server
            chunk_duration: Duration of each audio chunk in seconds
            device: Optional audio input device index
        """
        self.server_url = server_url
        self.sample_rate = sample_rate
        self.channels = channels
        self.chunk_duration = chunk_duration
        # Block size handed to the input stream, in frames per callback.
        self.chunk_samples = int(sample_rate * chunk_duration)
        self.device = device

        # Hand-off buffer between the audio callback and send_audio().
        self.audio_queue = queue.Queue()
        self.is_recording = False
        self.websocket = None

        logger.info("Microphone client initialized")
        logger.info(f"Server URL: {server_url}")
        logger.info(f"Sample rate: {sample_rate} Hz")
        logger.info(f"Chunk duration: {chunk_duration}s ({self.chunk_samples} samples)")

    def audio_callback(self, indata, frames, time_info, status):
        """
        Callback for sounddevice stream.

        Converts the first channel of ``indata`` to little 16-bit PCM bytes
        (native byte order) and enqueues them for ``send_audio``.

        Args:
            indata: Input audio data, indexed as [frame, channel]
            frames: Number of frames
            time_info: Timing information
            status: Status flags
        """
        if status:
            logger.warning(f"Audio callback status: {status}")

        # Clip before scaling: a float sample outside [-1, 1] would otherwise
        # overflow the int16 range and wrap around into loud noise.
        samples = np.clip(indata[:, 0], -1.0, 1.0)
        audio_data = (samples * 32767).astype(np.int16)
        self.audio_queue.put(audio_data.tobytes())

    async def send_audio(self):
        """
        Coroutine to send audio from queue to WebSocket.

        Runs until ``is_recording`` becomes False or sending fails. On exit it
        clears ``is_recording`` so the receiving coroutine stops as well
        instead of looping forever on a dead connection.
        """
        try:
            while self.is_recording:
                try:
                    # Non-blocking get: never stall the event loop on the queue.
                    audio_bytes = self.audio_queue.get_nowait()
                except queue.Empty:
                    # No audio data available, wait a bit
                    await asyncio.sleep(0.01)
                    continue

                if self.websocket is not None:
                    await self.websocket.send(audio_bytes)
        except Exception as e:
            logger.error(f"Error sending audio: {e}")
        finally:
            self.is_recording = False

    async def receive_transcripts(self):
        """
        Coroutine to receive transcripts from WebSocket.

        Polls the socket with a short timeout so the ``is_recording`` flag is
        re-checked regularly. On exit it clears ``is_recording`` so the sending
        coroutine stops as well.
        """
        try:
            while self.is_recording:
                if self.websocket is None:
                    # Yield to the event loop rather than spinning without an await.
                    await asyncio.sleep(0.01)
                    continue

                try:
                    message = await asyncio.wait_for(
                        self.websocket.recv(),
                        timeout=0.1
                    )
                except asyncio.TimeoutError:
                    continue

                self._handle_message(message)
        except Exception as e:
            logger.error(f"Error receiving transcript: {e}")
        finally:
            self.is_recording = False

    def _handle_message(self, message):
        """
        Decode one server message and log it according to its ``type`` field.

        Messages that are not valid JSON objects are logged as warnings and
        otherwise ignored; unknown ``type`` values are silently skipped.

        Args:
            message: Raw message as returned by the WebSocket ``recv()``
        """
        try:
            data = json.loads(message)
        except (json.JSONDecodeError, UnicodeDecodeError):
            logger.warning(f"Invalid JSON response: {message}")
            return

        # Valid JSON that is not an object (list, number, ...) has no "type".
        if not isinstance(data, dict):
            logger.warning(f"Invalid JSON response: {message}")
            return

        message_type = data.get("type")
        if message_type == "transcript":
            text = data.get("text", "")
            if data.get("is_final", False):
                logger.info(f"[FINAL] {text}")
            else:
                logger.info(f"[PARTIAL] {text}")
        elif message_type == "info":
            logger.info(f"Server: {data.get('message')}")
        elif message_type == "error":
            logger.error(f"Server error: {data.get('message')}")

    async def _send_final(self):
        """
        Send the ``{"type": "final"}`` command and log any replies.

        Best effort: failures (for example an already closed connection) are
        logged at debug level and not raised. Replies are read for at most
        ``FINAL_RESPONSE_TIMEOUT`` seconds.
        """
        if self.websocket is None:
            return

        try:
            await self.websocket.send(json.dumps({"type": "final"}))

            loop = asyncio.get_running_loop()
            deadline = loop.time() + self.FINAL_RESPONSE_TIMEOUT
            while True:
                remaining = deadline - loop.time()
                if remaining <= 0:
                    break
                message = await asyncio.wait_for(
                    self.websocket.recv(),
                    timeout=remaining
                )
                self._handle_message(message)
        except asyncio.TimeoutError:
            # Normal way out: the server had nothing more to say in time.
            pass
        except Exception as e:
            logger.debug(f"Could not complete final exchange: {e}")

    async def stream_audio(self):
        """
        Main coroutine to stream audio to server.

        Connects to ``server_url``, opens the microphone input stream and runs
        the send and receive coroutines until one of them stops. The final
        command is sent while the connection is still open. WebSocket errors
        are logged; other exceptions (for example from opening the audio
        device) propagate to the caller.
        """
        try:
            async with websockets.connect(self.server_url) as websocket:
                self.websocket = websocket
                logger.info(f"Connected to server: {self.server_url}")

                self.is_recording = True

                try:
                    # Start audio stream
                    with sd.InputStream(
                        samplerate=self.sample_rate,
                        channels=self.channels,
                        dtype=np.float32,
                        blocksize=self.chunk_samples,
                        device=self.device,
                        callback=self.audio_callback,
                    ):
                        logger.info("Recording started. Press Ctrl+C to stop.")

                        # Run send and receive coroutines concurrently
                        await asyncio.gather(
                            self.send_audio(),
                            self.receive_transcripts(),
                        )
                finally:
                    self.is_recording = False
                    # Must happen inside the `async with`: once it exits the
                    # socket is closed and the command could never be sent.
                    await self._send_final()

        except websockets.exceptions.WebSocketException as e:
            logger.error(f"WebSocket error: {e}")
        except KeyboardInterrupt:
            logger.info("Stopped by user")
        finally:
            self.is_recording = False
            self.websocket = None
            logger.info("Disconnected from server")

    def run(self):
        """
        Run the client (blocking).

        Starts a new event loop for ``stream_audio`` and returns when
        streaming ends or the user interrupts with Ctrl+C.
        """
        try:
            asyncio.run(self.stream_audio())
        except KeyboardInterrupt:
            logger.info("Client stopped by user")


def list_audio_devices():
    """
    Print every audio device that has at least one input channel.

    For each such device the index, name, input channel count and default
    sample rate reported by ``sd.query_devices()`` are written to stdout.
    """
    rule = "-" * 80

    print("\nAvailable audio input devices:")
    print(rule)
    for index, info in enumerate(sd.query_devices()):
        # Skip output-only devices.
        if not info['max_input_channels'] > 0:
            continue
        print(f"[{index}] {info['name']}")
        print(f"    Channels: {info['max_input_channels']}")
        print(f"    Sample rate: {info['default_samplerate']} Hz")
    print(rule)


def main():
    """
    Command-line entry point for the microphone client.

    Parses the command-line options, then either lists the audio input
    devices (``--list-devices``) or builds a client from the options and runs
    it until streaming stops.
    """
    import argparse

    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = (
        ("--url", dict(default="ws://localhost:8766", help="WebSocket server URL")),
        ("--sample-rate", dict(type=int, default=16000, help="Audio sample rate")),
        ("--device", dict(type=int, default=None, help="Audio input device index")),
        ("--list-devices", dict(action="store_true", help="List audio devices and exit")),
        ("--chunk-duration", dict(type=float, default=0.1, help="Audio chunk duration (seconds)")),
    )

    cli = argparse.ArgumentParser(description="Microphone Streaming Client")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    if options.list_devices:
        list_audio_devices()
        return

    MicrophoneStreamClient(
        server_url=options.url,
        sample_rate=options.sample_rate,
        device=options.device,
        chunk_duration=options.chunk_duration,
    ).run()


# Run the command-line client only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()