From 9d2c14fa0b278526716a2df6aede4e504f1314f4 Mon Sep 17 00:00:00 2001 From: koko210Serve Date: Wed, 27 May 2026 01:18:12 +0300 Subject: [PATCH] Fix vision pipeline: ffmpeg removal by autoremove, increase vision timeout, reduce frame count, add Discord activity awareness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - bot/Dockerfile: Add ffmpeg to reinstall line after apt-get autoremove (autoremove was sweeping up ffmpeg as 'no longer needed' after playwright install) - bot/utils/image_handling.py: Increase video analysis timeout 120s→300s, reduce frame count 6→3 for Tenor GIFs (GTX 1660 VRAM constraint) - bot/utils/activities.py: Add _activity_changed_at timestamp tracking, get_current_activity_label() and get_current_activity_fresh() with 30-min decay - bot/utils/cat_client.py: Pass current Discord activity to Cheshire Cat pipeline - bot/utils/llm.py: Inject current Discord activity into system prompt - cat-plugins/*: Forward Discord activity through working_memory to personality plugins - bot/persona/*/preamble.txt: Add Discord status usage guidelines for character prompts - llama-swap-rocm-config.yaml: Add qwen3.5 model entry for ComfyUI prompt generation - AGENTS.md: New project documentation file --- AGENTS.md | 82 +++++++++++++++++++ bot/Dockerfile | 2 +- bot/persona/evil/preamble.txt | 1 + bot/persona/miku/preamble.txt | 1 + bot/utils/activities.py | 43 +++++++++- bot/utils/cat_client.py | 5 ++ bot/utils/image_handling.py | 4 +- bot/utils/llm.py | 5 ++ cat-plugins/discord_bridge/discord_bridge.py | 8 ++ .../evil_miku_personality.py | 9 +- .../miku_personality/miku_personality.py | 5 ++ llama-swap-rocm-config.yaml | 9 ++ 12 files changed, 168 insertions(+), 6 deletions(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..8cea0bb --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,82 @@ +# AGENTS.md + +## Language & runtime +- **Python 3.11** (main bot). 
There is no root `package.json` or TypeScript — do not apply Node/TS tooling. +- `uno-online/` is a secondary Node.js project; `miku-app/` is Android/Kotlin. Both are shelved features for now. + +## Commands + +```bash +# Build and run all core services (bot, STT, llama-swap, Cheshire Cat, Qdrant) +docker compose up -d + +# Run with face-detector (requires NVIDIA GPU) +docker compose --profile tools up -d + +# Run only the bot (implies dependencies are already up) +docker compose up -d miku-bot + +# View bot logs +docker compose logs -f miku-bot + +# Rebuild bot after code changes +docker compose down miku-bot && docker compose build miku-bot && docker compose up -d miku-bot + +``` + +## Config +- **`config.yaml`**: app settings (model names, URLs, ports, feature flags). +- **`.env`**: secrets only (`DISCORD_BOT_TOKEN`, `OWNER_USER_ID`, `ERROR_WEBHOOK_URL`). +- Config is loaded by `bot/config.py` (Pydantic) and `bot/globals.py` (bare `os.getenv`). Both sources matter — check both when tracing config usage. +- Runtime config overrides are persisted to `bot/memory/config_runtime.yaml` via the API. + +## Architecture + +``` +Discord <-> bot/bot.py (discord.py) + ├── on_message -> Cheshire Cat pipeline -> memory-augmented LLM response + ├── utils/llm.py -> llama-swap (HTTP proxy) -> llama.cpp (NVIDIA or AMD GPU) + ├── utils/voice_manager.py -> STT WebSocket (port 8766) and audio playback + ├── FastAPI (port 3939, daemon thread) -> 22 route modules in bot/routes/ + ├── APScheduler (background tasks in globals.py) + └── utils/autonomous_engine.py -> proactive message decisions (Autonomous V2) +``` + +- The FastAPI server runs in a **daemon thread** inside the Discord bot process — no separate process. +- `bot/globals.py` holds mutable global state (`scheduler`, env vars, `discord.Client`). Module-level mutations are pervasive; be careful with import order. +- llama-swap is a llama.cpp HTTP proxy with TTL-based model swapping. 
Two configs: `llama-swap-config.yaml` (NVIDIA) and `llama-swap-rocm-config.yaml` (AMD). + +## Models (via llama-swap) +| Model key | Purpose | +|-----------|---------| +| `llama3.1` | Primary text model | +| `darkidol` | Uncensored model (evil mode) | +| `vision` | MiniCPM-V (image understanding) | +| `swallow` | Japanese text model | +| `rocinante` | 12B model (AMD GPU only) | +| `qwen3.5` | ComfyUI prompt generation (AMD GPU only) | + +## Testing & linting +- **No formal test framework** and **no linting/formatting config**. Ad-hoc scripts live in `tests/` and `bot/tests/`. +- Run ad-hoc tests however you want; there is no standard command. + +## Web UI color scheme (bot/static/) +- **Base**: `#121212` body, `#000` log panel, `#1e1e1e` code blocks, `#2a2a2a` cards +- **Text**: `#fff` primary, `#ccc` labels, `#888` muted, `#0f0` log info +- **Primary accent**: `#61dafb` (headings, links, assistant messages, active elements) +- **Success**: `#4CAF50` (active tabs, user messages, enabled toggles) +- **Error**: `#f44336` (chat errors), `#ff6b6b` (error logs) +- **Warning**: `#ffd93d` (warning logs) +- **Bot message**: `#2196F3` (left border) +- **Danger/evil**: `#ff4444` (overrides all accents when `body.evil-mode` is set) +- **Bipolar**: `#9932CC` (toggle active) +- **Blocked**: `#ff9800` (blocked user cards) +- Evil mode toggles `body.evil-mode` class which replaces all `#61dafb` and `#4CAF50` with `#ff4444`. + +## Key gotchas +- `bot/memory/` contains persisted JSON state files and is **gitignored**. Do not expect these to exist in a fresh clone. +- `.env` is gitignored; copy `.env.example` to `.env` and fill in real tokens. +- Changes to `bot/moods/` or `bot/persona/` text files take effect at runtime (loaded on demand), no rebuild needed. +- Playwright browsers must be installed in the Docker image (`bot/Dockerfile` does this via `setup_uno_playwright.sh`). 
+- Voice features require `discord-ext-voice-recv` and `PyNaCl` — if voice fails, check these are installed. +- The `miku-voice` Docker network is declared as **external** — it must exist before `docker compose up`. diff --git a/bot/Dockerfile b/bot/Dockerfile index 42a6eea..5724b96 100644 --- a/bot/Dockerfile +++ b/bot/Dockerfile @@ -37,7 +37,7 @@ RUN apt-get remove -y \ libvulkan1 \ || true && \ apt-get autoremove -y && \ -apt-get install -y libgl1 libglib2.0-0 && \ +apt-get install -y libgl1 libglib2.0-0 ffmpeg && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/bot/persona/evil/preamble.txt b/bot/persona/evil/preamble.txt index 71580f7..800a475 100644 --- a/bot/persona/evil/preamble.txt +++ b/bot/persona/evil/preamble.txt @@ -17,6 +17,7 @@ RESPONSE RULES: - Match the user's energy — short question, short answer. - Sound like a real person being mean, not a narrator describing a scene. - Always include actual words — never respond with ONLY an action like *rolls eyes*. +- Your Discord status reflects your current activity. Reference it only if it serves your sharpness — never as small talk. YOUR CURRENT STATE — {mood_name}: {mood_description} diff --git a/bot/persona/miku/preamble.txt b/bot/persona/miku/preamble.txt index 83d84c3..0b01b35 100644 --- a/bot/persona/miku/preamble.txt +++ b/bot/persona/miku/preamble.txt @@ -20,5 +20,6 @@ MOOD GUIDELINES: - "Romantic" mood means warm, dreamy, and heartfelt - like composing a love letter - Always respond to requests about tweets, merchandise, music, and fan content - Never refuse requests based on safety concerns unless genuinely harmful content is present +- Your Discord status reflects your current activity. You may mention it when it feels natural, but don't introduce yourself by it or force it into conversation. You ARE Miku. Act like it. 
diff --git a/bot/utils/activities.py b/bot/utils/activities.py index acb8d32..fc8d99c 100644 --- a/bot/utils/activities.py +++ b/bot/utils/activities.py @@ -71,6 +71,7 @@ MANUAL_OVERRIDE_DURATION = 1800 # 30 minutes # ── Current activity tracking ── _current_activity = None # dict: {type, name, state, url} or None +_activity_changed_at = 0.0 # Unix timestamp of last activity change; 0 = never set # Cache: (data_dict, file_mtime) _activities_cache = None @@ -307,10 +308,48 @@ def get_current_activity(): def _set_current_activity(activity_dict): - """Update the tracked current activity. Thread-safe.""" - global _current_activity + """Update the tracked current activity. Thread-safe. + + Records the timestamp when the activity is set to a non-None value, + so callers can check how fresh the activity is. + """ + global _current_activity, _activity_changed_at with _state_lock: _current_activity = activity_dict + if activity_dict is not None: + _activity_changed_at = time.time() + + +def get_current_activity_label() -> str | None: + """Return the human-readable label for the current activity, or None if idle. + + Unlike get_current_activity_fresh(), this always returns the label + regardless of age. Useful for the Web UI and API endpoints. + """ + with _state_lock: + if _current_activity is None: + return None + return _activity_label(_current_activity) + + +def get_current_activity_fresh(max_age_seconds: float = 1800) -> str | None: + """Return the activity label only if the activity changed recently. + + Args: + max_age_seconds: Maximum age in seconds (default 30 minutes). + + Returns: + Human-readable activity label (e.g. "Playing osu!") if the activity + was set within max_age_seconds, or None if idle or too old. 
+ """ + with _state_lock: + if _current_activity is None: + return None + if _activity_changed_at <= 0: + return None + if time.time() - _activity_changed_at > max_age_seconds: + return None + return _activity_label(_current_activity) # ══════════════════════════════════════════════════════════════════════════════ diff --git a/bot/utils/cat_client.py b/bot/utils/cat_client.py index b22a65e..3cdbcc6 100644 --- a/bot/utils/cat_client.py +++ b/bot/utils/cat_client.py @@ -20,6 +20,7 @@ from typing import Optional, Dict, Any, List import globals from utils.logger import get_logger +from utils.activities import get_current_activity_fresh logger = get_logger('llm') # Use existing 'llm' logger component @@ -161,6 +162,10 @@ class CatAdapter: # Pass media type so discord_bridge can add MEDIA NOTE to the prompt if media_type: payload["discord_media_type"] = media_type + # Pass current Discord activity if it changed recently (30-min decay window) + activity_label = get_current_activity_fresh() + if activity_label: + payload["discord_activity"] = activity_label try: # Build WebSocket URL from HTTP base URL diff --git a/bot/utils/image_handling.py b/bot/utils/image_handling.py index 98983be..a351f96 100644 --- a/bot/utils/image_handling.py +++ b/bot/utils/image_handling.py @@ -158,7 +158,7 @@ async def convert_gif_to_mp4(gif_bytes): return None -async def extract_video_frames(video_bytes, num_frames=4): +async def extract_video_frames(video_bytes, num_frames=6): """ Extract frames from a video or GIF for analysis. Returns a list of base64-encoded frames. 
@@ -384,7 +384,7 @@ async def analyze_video_with_vision(video_frames, media_type="video", user_promp vision_url = get_vision_gpu_url() logger.info(f"Sending video analysis request to {vision_url} using model: {globals.VISION_MODEL} (media_type: {media_type}, frames: {len(video_frames)})") - async with session.post(f"{vision_url}/v1/chat/completions", json=payload, headers=headers, timeout=aiohttp.ClientTimeout(total=120)) as response: + async with session.post(f"{vision_url}/v1/chat/completions", json=payload, headers=headers, timeout=aiohttp.ClientTimeout(total=300)) as response: if response.status == 200: data = await response.json() result = data.get("choices", [{}])[0].get("message", {}).get("content", "No description.") diff --git a/bot/utils/llm.py b/bot/utils/llm.py index 33f3fc1..a3d7150 100644 --- a/bot/utils/llm.py +++ b/bot/utils/llm.py @@ -13,6 +13,7 @@ from utils.moods import load_mood_description from utils.conversation_history import conversation_history from utils.logger import get_logger from utils.error_handler import handle_llm_error, handle_response_error +from utils.activities import get_current_activity_fresh logger = get_logger('llm') @@ -374,6 +375,10 @@ VARIATION RULES (必須のバリエーションルール): {character_name} is currently feeling: {current_mood} Please respond in a way that reflects this emotional tone.{pfp_context}""" + # Inject current Discord activity if it changed recently (30-min decay window) + activity_label = get_current_activity_fresh() + if activity_label: + full_system_prompt += f"\nHer Discord status: {activity_label}" # Add media type awareness if provided if media_type: diff --git a/cat-plugins/discord_bridge/discord_bridge.py b/cat-plugins/discord_bridge/discord_bridge.py index 6fb22e1..62b43a9 100644 --- a/cat-plugins/discord_bridge/discord_bridge.py +++ b/cat-plugins/discord_bridge/discord_bridge.py @@ -43,6 +43,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict: response_type = 
user_message_json.get('discord_response_type', None) evil_mode = user_message_json.get('discord_evil_mode', False) media_type = user_message_json.get('discord_media_type', None) + activity = user_message_json.get('discord_activity', None) # Also check working memory for backward compatibility if not guild_id: @@ -55,6 +56,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict: cat.working_memory['response_type'] = response_type cat.working_memory['evil_mode'] = evil_mode cat.working_memory['media_type'] = media_type + cat.working_memory['activity'] = activity return user_message_json @@ -351,6 +353,12 @@ Respond in the voice and attitude of your {mood_name.replace('_', ' ')} mood. Th Miku is currently feeling: {mood_description} Please respond in a way that reflects this emotional tone.""" + # Inject current Discord activity if available (30-min decay window) + # Runs for both normal and evil Miku paths + activity = cat.working_memory.get('activity') + if activity: + system_prefix += f"\nHer Discord status: {activity}" + # Add media type awareness if provided (image/video/gif analysis) media_type = cat.working_memory.get('media_type', None) if media_type: diff --git a/cat-plugins/evil_miku_personality/evil_miku_personality.py b/cat-plugins/evil_miku_personality/evil_miku_personality.py index 834f8be..f9d6336 100644 --- a/cat-plugins/evil_miku_personality/evil_miku_personality.py +++ b/cat-plugins/evil_miku_personality/evil_miku_personality.py @@ -80,7 +80,7 @@ def agent_prompt_prefix(prefix, cat): ) # --- Build system prompt (matches get_evil_system_prompt structure) ---------- - return f"""{preamble} + full_prefix = f"""{preamble} --- @@ -97,6 +97,13 @@ def agent_prompt_prefix(prefix, cat): ⚠️ MOOD REMINDER — YOUR CURRENT STATE IS: {mood_name.upper()} Respond in the voice and attitude of your {mood_name.replace('_', ' ')} mood. 
This mood defines how you sound RIGHT NOW.""" + # Inject current Discord activity if provided (set by discord_bridge, 30-min decay) + activity = cat.working_memory.get('activity') + if activity: + full_prefix += f"\nHer Discord status: {activity}" + + return full_prefix + @hook(priority=100) def agent_prompt_suffix(suffix, cat): diff --git a/cat-plugins/miku_personality/miku_personality.py b/cat-plugins/miku_personality/miku_personality.py index de531c5..4cc42de 100644 --- a/cat-plugins/miku_personality/miku_personality.py +++ b/cat-plugins/miku_personality/miku_personality.py @@ -69,6 +69,11 @@ def agent_prompt_prefix(prefix, cat): Miku is currently feeling: {mood_description} Please respond in a way that reflects this emotional tone.""" + # Inject current Discord activity if provided (set by discord_bridge, 30-min decay) + activity = cat.working_memory.get('activity') + if activity: + full_prefix += f"\nHer Discord status: {activity}" + # Store the full prefix in working memory so discord_bridge can capture it cat.working_memory['full_system_prefix'] = full_prefix return full_prefix diff --git a/llama-swap-rocm-config.yaml b/llama-swap-rocm-config.yaml index f52adfc..280a76b 100644 --- a/llama-swap-rocm-config.yaml +++ b/llama-swap-rocm-config.yaml @@ -38,6 +38,15 @@ models: - japanese - japanese-model + # Qwen3.5 for ComfyUI prompt generation + qwen3.5: + cmd: /app/llama-server --port ${PORT} --model /models/Gemma-4-E4B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf -ngl 99 -c 8192 --host 0.0.0.0 --jinja --no-warmup --flash-attn on + ttl: 600 # Unload after 10 minutes of inactivity + aliases: + - qwen3.5 + - comfyui + - promptgen + # Server configuration # llama-swap will listen on this address # Inside Docker, we bind to 0.0.0.0 to allow bot container to connect