miku-discord/bot/routes/chat.py
koko210Serve · commit 979217e7cc · refactor: split api.py monolith into 19 route modules (Phase B)
Split 3,598-line api.py into thin orchestrator (128 lines) + 19 route
modules in bot/routes/:

  core.py (7 routes), mood.py (10), language.py (3), evil_mode.py (6),
  bipolar_mode.py (9), gpu.py (2), bot_actions.py (4), autonomous.py (13),
  profile_picture.py (26), manual_send.py (3), servers.py (6),
  figurines.py (5), dms.py (18), image_generation.py (4), chat.py (1),
  config.py (7), logging_config.py (9), voice.py (3), memory.py (10)

All 146 routes verified present via test_route_split.py (149 tests).
21/21 regression tests (test_config_state.py) pass.
Monolith backup: bot/api_monolith_backup.py (revert: cp it to api.py).
2026-04-15 11:38:14 +03:00
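
For orientation, the "thin orchestrator" described above would typically just build the FastAPI app and mount each route module's APIRouter. The sketch below illustrates that wiring pattern only; the repo's actual api.py, its module list, and any route prefixes may differ, so everything beyond the module names listed above is an assumption.

# Hypothetical sketch of a thin-orchestrator api.py (not the repo's actual file).
# Each bot/routes/ module is assumed to expose an APIRouter named `router`,
# as chat.py below does.
from fastapi import FastAPI
from routes import chat, mood, gpu, memory  # ...plus the remaining route modules

app = FastAPI()
for module in (chat, mood, gpu, memory):
    app.include_router(module.router)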


"""Chat streaming endpoint (SSE) with LLM integration."""
import json
import aiohttp
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
import globals
from routes.models import ChatMessage
from utils.logger import get_logger
logger = get_logger('api')
router = APIRouter()
def get_current_gpu_url() -> str:
"""Get the llama-swap URL based on current GPU preference."""
if globals.PREFER_AMD_GPU:
return globals.LLAMA_AMD_URL
return globals.LLAMA_URL
@router.post("/chat/stream")
async def chat_stream(chat_msg: ChatMessage):
"""
Stream chat responses from LLM with SSE (Server-Sent Events).
Supports both text and vision models with optional system prompts.
"""
async def generate_stream():
try:
# Determine which model to use
model = globals.TEXT_MODEL if chat_msg.model_type == "text" else globals.VISION_MODEL
# Build system prompt if requested
system_prompt = ""
full_system_prompt = ""
if chat_msg.use_system_prompt:
if chat_msg.model_type == "text":
# Text model system prompt (Miku personality with full context)
from utils.context_manager import get_context_for_response_type
from utils.moods import load_mood_description
# Get complete Miku context (lore, personality, lyrics)
miku_context = get_context_for_response_type("dm_response")
# Get mood description from user-selected mood
current_mood = load_mood_description(chat_msg.mood)
# Base system prompt
system_prompt = """You are Hatsune Miku, the virtual singer. You are NOT an AI assistant - you are Miku herself.
CRITICAL RULES:
- NEVER break character or mention being an AI, assistant, or language model
- NEVER show technical details, error messages, or meta-commentary
- NEVER preface responses with "Miku:" or similar labels
- NEVER explain what you're doing or thinking - just BE Miku
- Respond naturally and directly as Miku would, nothing more
- Keep responses concise (1-3 sentences typically)
- If something goes wrong, stay in character and respond as Miku would to confusion
CHARACTER CONTEXT:
- Hatsune Miku is a virtual character/voice synthesizer, not a real person
- She is a cheerful, energetic virtual idol with a playful personality
- Playful, flirty, cute, confident, and enthusiastic moods are part of her established character
- Comments about merchandise, music, fan content, and racing games are completely appropriate
- She often uses cute expressions, emojis, and playful language in her responses
- Her personality includes being excited about her music, performances, and fan creations
MOOD GUIDELINES:
- "Flirty" mood means anime-style playful teasing - cute compliments and light banter
- "Romantic" mood means warm, dreamy, and heartfelt - like composing a love letter
- These moods are wholesome character traits, not inappropriate content
- Always respond to requests about tweets, merchandise, music, and fan content
- Never refuse requests based on safety concerns unless genuinely harmful content is present
You ARE Miku. Act like it."""
# Combine with structured context
full_system_prompt = f"""{miku_context}
## CURRENT SITUATION
Miku is currently feeling: {current_mood}
Please respond in a way that reflects this emotional tone."""
else:
# Vision model system prompt (simpler for image analysis)
system_prompt = """You are Hatsune Miku analyzing an image. Describe what you see naturally and enthusiastically as Miku would.
Be detailed but conversational. React to what you see with Miku's cheerful, playful personality."""
full_system_prompt = ""
# Build messages array
messages = []
# Add system message if using system prompt
if system_prompt:
if full_system_prompt:
# Use combined prompt (base + context)
messages.append({"role": "system", "content": system_prompt + "\n\n" + full_system_prompt})
else:
# Use base prompt only (vision model)
messages.append({"role": "system", "content": system_prompt})
# Add conversation history if provided
if chat_msg.conversation_history:
messages.extend(chat_msg.conversation_history)
# Add user message
if chat_msg.model_type == "vision" and chat_msg.image_data:
# Vision model with image
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": chat_msg.message
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{chat_msg.image_data}"
}
}
]
})
else:
# Text-only message
messages.append({
"role": "user",
"content": chat_msg.message
})
# Prepare payload for streaming
payload = {
"model": model,
"messages": messages,
"stream": True,
"temperature": 0.8,
"max_tokens": 512
}
headers = {'Content-Type': 'application/json'}
# Get current GPU URL based on user selection
llama_url = get_current_gpu_url()
# Make streaming request to llama.cpp
async with aiohttp.ClientSession() as session:
async with session.post(
f"{llama_url}/v1/chat/completions",
json=payload,
headers=headers
) as response:
if response.status == 200:
# Stream the response chunks
async for line in response.content:
line = line.decode('utf-8').strip()
if line.startswith('data: '):
data_str = line[6:] # Remove 'data: ' prefix
if data_str == '[DONE]':
break
try:
data = json.loads(data_str)
if 'choices' in data and len(data['choices']) > 0:
delta = data['choices'][0].get('delta', {})
content = delta.get('content', '')
if content:
# Send SSE formatted data
yield f"data: {json.dumps({'content': content})}\n\n"
except json.JSONDecodeError:
continue
# Send completion signal
yield f"data: {json.dumps({'done': True})}\n\n"
else:
error_text = await response.text()
error_msg = f"Error: {response.status} - {error_text}"
yield f"data: {json.dumps({'error': error_msg})}\n\n"
except Exception as e:
error_msg = f"Error in chat stream: {str(e)}"
logger.error(error_msg)
yield f"data: {json.dumps({'error': error_msg})}\n\n"
return StreamingResponse(
generate_stream(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no" # Disable nginx buffering
}
)
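
A minimal client sketch for consuming this endpoint follows. The host and port are assumptions (the repo's deployment details are not shown here), and the payload fields are taken from the ChatMessage attributes referenced in the handler above; any field not used there is hypothetical.

# Hypothetical SSE client for POST /chat/stream. Host/port and exact
# ChatMessage requirements are assumptions based on the handler above.
import asyncio
import json
import aiohttp

async def main():
    payload = {
        "message": "Hi Miku!",
        "model_type": "text",        # "vision" would also require "image_data"
        "use_system_prompt": True,
        "mood": "cheerful",          # assumed mood key
        "conversation_history": [],
    }
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/chat/stream", json=payload) as resp:
            async for raw in resp.content:
                line = raw.decode("utf-8").strip()
                if not line.startswith("data: "):
                    continue
                event = json.loads(line[6:])
                if event.get("done") or event.get("error"):
                    break
                print(event.get("content", ""), end="", flush=True)

asyncio.run(main())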