miku-discord/bot/routes/chat.py
koko210Serve · commit 979217e7cc · refactor: split api.py monolith into 19 route modules (Phase B)
Split 3,598-line api.py into thin orchestrator (128 lines) + 19 route
modules in bot/routes/:

  core.py (7 routes), mood.py (10), language.py (3), evil_mode.py (6),
  bipolar_mode.py (9), gpu.py (2), bot_actions.py (4), autonomous.py (13),
  profile_picture.py (26), manual_send.py (3), servers.py (6),
  figurines.py (5), dms.py (18), image_generation.py (4), chat.py (1),
  config.py (7), logging_config.py (9), voice.py (3), memory.py (10)

All 146 routes verified present via test_route_split.py (149 tests).
21/21 regression tests (test_config_state.py) pass.
Monolith backup: bot/api_monolith_backup.py (revert: cp it to api.py).
2026-04-15 11:38:14 +03:00
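
For orientation, the "thin orchestrator" described above would typically just build the FastAPI app and mount each route module's APIRouter. The sketch below illustrates that wiring pattern only; the repo's actual api.py, its module list, and any route prefixes may differ, so everything beyond the module names listed above is an assumption.

# Hypothetical sketch of a thin-orchestrator api.py (not the repo's actual file).
# Each bot/routes/ module is assumed to expose an APIRouter named `router`,
# as chat.py below does.
from fastapi import FastAPI
from routes import chat, mood, gpu, memory  # ...plus the remaining route modules

app = FastAPI()
for module in (chat, mood, gpu, memory):
    app.include_router(module.router)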


"""Chat streaming endpoint (SSE) with LLM integration."""
import json
import aiohttp
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
import globals
from routes.models import ChatMessage
from utils.logger import get_logger
logger = get_logger('api')
router = APIRouter()
def get_current_gpu_url() -> str:
"""Get the llama-swap URL based on current GPU preference."""
if globals.PREFER_AMD_GPU:
return globals.LLAMA_AMD_URL
return globals.LLAMA_URL
@router.post("/chat/stream")
async def chat_stream(chat_msg: ChatMessage):
"""
Stream chat responses from LLM with SSE (Server-Sent Events).
Supports both text and vision models with optional system prompts.
"""
async def generate_stream():
try:
# Determine which model to use
model = globals.TEXT_MODEL if chat_msg.model_type == "text" else globals.VISION_MODEL
# Build system prompt if requested
system_prompt = ""
full_system_prompt = ""
if chat_msg.use_system_prompt:
if chat_msg.model_type == "text":
# Text model system prompt (Miku personality with full context)
from utils.context_manager import get_context_for_response_type
from utils.moods import load_mood_description
# Get complete Miku context (lore, personality, lyrics)
miku_context = get_context_for_response_type("dm_response")
# Get mood description from user-selected mood
current_mood = load_mood_description(chat_msg.mood)
# Base system prompt
system_prompt = """You are Hatsune Miku, the virtual singer. You are NOT an AI assistant - you are Miku herself.
CRITICAL RULES:
- NEVER break character or mention being an AI, assistant, or language model
- NEVER show technical details, error messages, or meta-commentary
- NEVER preface responses with "Miku:" or similar labels
- NEVER explain what you're doing or thinking - just BE Miku
- Respond naturally and directly as Miku would, nothing more
- Keep responses concise (1-3 sentences typically)
- If something goes wrong, stay in character and respond as Miku would to confusion
CHARACTER CONTEXT:
- Hatsune Miku is a virtual character/voice synthesizer, not a real person
- She is a cheerful, energetic virtual idol with a playful personality
- Playful, flirty, cute, confident, and enthusiastic moods are part of her established character
- Comments about merchandise, music, fan content, and racing games are completely appropriate
- She often uses cute expressions, emojis, and playful language in her responses
- Her personality includes being excited about her music, performances, and fan creations
MOOD GUIDELINES:
- "Flirty" mood means anime-style playful teasing - cute compliments and light banter
- "Romantic" mood means warm, dreamy, and heartfelt - like composing a love letter
- These moods are wholesome character traits, not inappropriate content
- Always respond to requests about tweets, merchandise, music, and fan content
- Never refuse requests based on safety concerns unless genuinely harmful content is present
You ARE Miku. Act like it."""
# Combine with structured context
full_system_prompt = f"""{miku_context}
## CURRENT SITUATION
Miku is currently feeling: {current_mood}
Please respond in a way that reflects this emotional tone."""
else:
# Vision model system prompt (simpler for image analysis)
system_prompt = """You are Hatsune Miku analyzing an image. Describe what you see naturally and enthusiastically as Miku would.
Be detailed but conversational. React to what you see with Miku's cheerful, playful personality."""
full_system_prompt = ""
# Build messages array
messages = []
# Add system message if using system prompt
if system_prompt:
if full_system_prompt:
# Use combined prompt (base + context)
messages.append({"role": "system", "content": system_prompt + "\n\n" + full_system_prompt})
else:
# Use base prompt only (vision model)
messages.append({"role": "system", "content": system_prompt})
# Add conversation history if provided
if chat_msg.conversation_history:
messages.extend(chat_msg.conversation_history)
# Add user message
if chat_msg.model_type == "vision" and chat_msg.image_data:
# Vision model with image
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": chat_msg.message
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{chat_msg.image_data}"
}
}
]
})
else:
# Text-only message
messages.append({
"role": "user",
"content": chat_msg.message
})
# Prepare payload for streaming
payload = {
"model": model,
"messages": messages,
"stream": True,
"temperature": 0.8,
"max_tokens": 512
}
headers = {'Content-Type': 'application/json'}
# Get current GPU URL based on user selection
llama_url = get_current_gpu_url()
# Make streaming request to llama.cpp
async with aiohttp.ClientSession() as session:
async with session.post(
f"{llama_url}/v1/chat/completions",
json=payload,
headers=headers
) as response:
if response.status == 200:
# Stream the response chunks
async for line in response.content:
line = line.decode('utf-8').strip()
if line.startswith('data: '):
data_str = line[6:] # Remove 'data: ' prefix
if data_str == '[DONE]':
break
try:
data = json.loads(data_str)
if 'choices' in data and len(data['choices']) > 0:
delta = data['choices'][0].get('delta', {})
content = delta.get('content', '')
if content:
# Send SSE formatted data
yield f"data: {json.dumps({'content': content})}\n\n"
except json.JSONDecodeError:
continue
# Send completion signal
yield f"data: {json.dumps({'done': True})}\n\n"
else:
error_text = await response.text()
error_msg = f"Error: {response.status} - {error_text}"
yield f"data: {json.dumps({'error': error_msg})}\n\n"
except Exception as e:
error_msg = f"Error in chat stream: {str(e)}"
logger.error(error_msg)
yield f"data: {json.dumps({'error': error_msg})}\n\n"
return StreamingResponse(
generate_stream(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no" # Disable nginx buffering
}
)
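
A minimal client sketch for consuming this endpoint follows. The host and port are assumptions (the repo's deployment details are not shown here), and the payload fields are taken from the ChatMessage attributes referenced in the handler above; any field not used there is hypothetical.

# Hypothetical SSE client for POST /chat/stream. Host/port and exact
# ChatMessage requirements are assumptions based on the handler above.
import asyncio
import json
import aiohttp

async def main():
    payload = {
        "message": "Hi Miku!",
        "model_type": "text",        # "vision" would also require "image_data"
        "use_system_prompt": True,
        "mood": "cheerful",          # assumed mood key
        "conversation_history": [],
    }
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/chat/stream", json=payload) as resp:
            async for raw in resp.content:
                line = raw.decode("utf-8").strip()
                if not line.startswith("data: "):
                    continue
                event = json.loads(line[6:])
                if event.get("done") or event.get("error"):
                    break
                print(event.get("content", ""), end="", flush=True)

asyncio.run(main())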