Files
miku-discord/llama-swap-rocm-config.yaml
koko210Serve 9d2c14fa0b Fix vision pipeline: ffmpeg removal by autoremove, increase vision timeout, reduce frame count, add Discord activity awareness
- bot/Dockerfile: Add ffmpeg to reinstall line after apt-get autoremove
  (autoremove was sweeping up ffmpeg as 'no longer needed' after playwright install)
- bot/utils/image_handling.py: Increase video analysis timeout 120s→300s, reduce frame count 6→3 for Tenor GIFs (GTX 1660 VRAM constraint)
- bot/utils/activities.py: Add _activity_changed_at timestamp tracking,
  get_current_activity_label() and get_current_activity_fresh() with 30-min decay
- bot/utils/cat_client.py: Pass current Discord activity to Cheshire Cat pipeline
- bot/utils/llm.py: Inject current Discord activity into system prompt
- cat-plugins/*: Forward Discord activity through working_memory to personality plugins
- bot/persona/*/preamble.txt: Add Discord status usage guidelines for character prompts
- llama-swap-rocm-config.yaml: Add qwen3.5 model entry for ComfyUI prompt generation
- AGENTS.md: New project documentation file
2026-05-27 01:18:12 +03:00

53 lines
2.2 KiB
YAML

# llama-swap configuration for AMD RX 6800 (ROCm)
# This manages automatic model switching and unloading for the secondary GPU
# Vision model stays on NVIDIA GPU - AMD only handles text models
# Model registry. Each key under `models` is a model ID; its entry gives the
# llama-server command line (`cmd`), the idle timeout in seconds before the
# model is unloaded (`ttl`), and alternative names that resolve to the same
# model (`aliases`). `${PORT}` is a llama-swap macro filled in at launch time.
# Every `cmd` is a folded scalar (`>-`): the lines below it are joined with
# single spaces into one command string, so keep them at the same indent.
models:
  # Main text generation model (same name as NVIDIA for uniform switching)
  # Custom chat template to disable built-in tool calling
  llama3.1:
    cmd: >-
      /app/llama-server --port ${PORT}
      --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf
      -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
      --chat-template-file /app/llama31_notool_template.jinja
    ttl: 1800  # Unload after 30 minutes of inactivity (1800 seconds)
    aliases:
      - llama3.1
      - text-model

  # Evil/Uncensored model (same name as NVIDIA for uniform switching)
  darkidol:
    cmd: >-
      /app/llama-server --port ${PORT}
      --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf
      -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
    ttl: 1800  # Unload after 30 minutes of inactivity
    aliases:
      - darkidol
      - evil-model
      - uncensored

  # Rocinante-X 12B - larger creative/RP model for comparison testing
  # (runs with a smaller 8192-token context than the 8B models)
  rocinante:
    cmd: >-
      /app/llama-server --port ${PORT}
      --model /models/Rocinante-X-12B-v1b-Q5_K_M.gguf
      -ngl 99 -c 8192 --host 0.0.0.0 --no-warmup --flash-attn on
    ttl: 1800  # Unload after 30 minutes of inactivity
    aliases:
      - rocinante
      - rocinante-12b

  # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
  swallow:
    cmd: >-
      /app/llama-server --port ${PORT}
      --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf
      -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
    ttl: 1800  # Unload after 30 minutes of inactivity
    aliases:
      - swallow
      - japanese
      - japanese-model

  # Qwen3.5 for ComfyUI prompt generation
  # NOTE(review): this entry is named qwen3.5, but --model points at a
  # Gemma-4-E4B GGUF file. Confirm which model is intended and align the
  # entry name/comment with the file path.
  qwen3.5:
    cmd: >-
      /app/llama-server --port ${PORT}
      --model /models/Gemma-4-E4B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf
      -ngl 99 -c 8192 --host 0.0.0.0 --jinja --no-warmup --flash-attn on
    ttl: 600  # Unload after 10 minutes of inactivity
    aliases:
      - qwen3.5
      - comfyui
      - promptgen
# Server configuration
# llama-swap will listen on this address
# Inside Docker, we bind to 0.0.0.0 to allow bot container to connect