Add dual GPU support with web UI selector
Features:
- Built custom ROCm container for AMD RX 6800 GPU
- Added GPU selection toggle in web UI (NVIDIA/AMD)
- Unified model names across both GPUs for seamless switching
- Vision model always uses NVIDIA GPU (optimal performance)
- Text models (llama3.1, darkidol) can use either GPU
- Added /gpu-status and /gpu-select API endpoints
- Implemented GPU state persistence in memory/gpu_state.json

Technical details:
- Multi-stage Dockerfile.llamaswap-rocm with ROCm 6.2.4
- llama.cpp compiled with GGML_HIP=ON for gfx1030 (RX 6800)
- Proper GPU permissions without root (groups 187/989)
- AMD container on port 8091, NVIDIA on port 8090
- Updated bot/utils/llm.py with get_current_gpu_url() and get_vision_gpu_url()
- Modified bot/utils/image_handling.py to always use NVIDIA for vision
- Enhanced web UI with GPU selector button (blue=NVIDIA, red=AMD)

Files modified:
- docker-compose.yml (added llama-swap-amd service)
- bot/globals.py (added LLAMA_AMD_URL)
- bot/api.py (added GPU selection endpoints and helper function)
- bot/utils/llm.py (GPU routing for text models)
- bot/utils/image_handling.py (GPU routing for vision models)
- bot/static/index.html (GPU selector UI)
- llama-swap-rocm-config.yaml (unified model names)

New files:
- Dockerfile.llamaswap-rocm
- bot/memory/gpu_state.json
- bot/utils/gpu_router.py (load balancing utility)
- setup-dual-gpu.sh (setup verification script)
- DUAL_GPU_*.md (documentation files)
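For illustration, a minimal sketch of driving the new GPU selection endpoints from a client. The /gpu-status and /gpu-select paths come from the feature list above; the bot's base URL and the JSON payload shape are assumptions made only for this example.

import requests

BOT_API = "http://localhost:8080"  # assumed bot API address, not specified by this commit

# Query which GPU the bot is currently routing text models to
status = requests.get(f"{BOT_API}/gpu-status", timeout=10).json()
print(status)  # response schema is an assumption, e.g. {"gpu": "nvidia"}

# Ask the bot to route text models to the AMD RX 6800 instead
requests.post(f"{BOT_API}/gpu-select", json={"gpu": "amd"}, timeout=10)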
bot/utils/gpu_router.py (new file, 191 lines added)
@@ -0,0 +1,191 @@
"""
GPU Router Utility for Dual GPU Setup
Manages routing between NVIDIA and AMD GPUs for model inference
"""

import os
import random
import logging
from typing import Optional, Literal

import globals

logger = logging.getLogger(__name__)

# Model to GPU mapping
MODEL_TO_GPU = {
    # NVIDIA models (primary GPU)
    "llama3.1": globals.LLAMA_URL,
    "text-model": globals.LLAMA_URL,
    "darkidol": globals.LLAMA_URL,
    "evil-model": globals.LLAMA_URL,
    "uncensored": globals.LLAMA_URL,
    "vision": globals.LLAMA_URL,
    "vision-model": globals.LLAMA_URL,
    "minicpm": globals.LLAMA_URL,

    # AMD models (secondary GPU - RX 6800)
    "llama3.1-amd": globals.LLAMA_AMD_URL,
    "text-model-amd": globals.LLAMA_AMD_URL,
    "amd-text": globals.LLAMA_AMD_URL,
    "darkidol-amd": globals.LLAMA_AMD_URL,
    "evil-model-amd": globals.LLAMA_AMD_URL,
    "uncensored-amd": globals.LLAMA_AMD_URL,
    "moondream-amd": globals.LLAMA_AMD_URL,
    "vision-amd": globals.LLAMA_AMD_URL,
    "moondream": globals.LLAMA_AMD_URL,
}

# Configuration
PREFER_AMD_GPU = os.getenv("PREFER_AMD_GPU", "false").lower() == "true"
AMD_MODELS_ENABLED = os.getenv("AMD_MODELS_ENABLED", "true").lower() == "true"


def get_endpoint_for_model(model_name: str) -> str:
    """
    Get the correct llama-swap endpoint for a model.

    Args:
        model_name: Name or alias of the model

    Returns:
        URL of the llama-swap endpoint (either NVIDIA or AMD)
    """
    endpoint = MODEL_TO_GPU.get(model_name, globals.LLAMA_URL)

    # If AMD models are disabled, use NVIDIA for AMD models too
    if not AMD_MODELS_ENABLED and endpoint == globals.LLAMA_AMD_URL:
        # Model names are unified across both GPUs, so the NVIDIA equivalent
        # is simply the alias without the "-amd" suffix; include it in the log.
        nvidia_model = model_name.replace("-amd", "")
        logger.warning(
            f"AMD GPU disabled, routing {model_name} to NVIDIA GPU as {nvidia_model}"
        )
        endpoint = globals.LLAMA_URL

    return endpoint


def is_amd_model(model_name: str) -> bool:
    """
    Check if a model runs on the AMD GPU.

    Args:
        model_name: Name or alias of the model

    Returns:
        True if model runs on AMD GPU, False otherwise
    """
    return model_name.endswith("-amd") or model_name in ["moondream", "moondream-amd", "vision-amd"]


def get_llama_url_with_load_balancing(
    prefer_amd: bool = False,
    task_type: Literal["text", "vision", "evil"] = "text"
) -> tuple[str, str]:
    """
    Get llama URL with optional load balancing between GPUs.
    Returns both URL and recommended model name.

    Args:
        prefer_amd: If True, prefer AMD GPU when possible
        task_type: Type of task (text, vision, or evil)

    Returns:
        Tuple of (url, model_name)
    """
    if not AMD_MODELS_ENABLED:
        # AMD disabled, use NVIDIA only
        if task_type == "evil":
            return globals.LLAMA_URL, "darkidol"
        elif task_type == "vision":
            return globals.LLAMA_URL, "vision"
        else:
            return globals.LLAMA_URL, "llama3.1"

    # AMD enabled - implement load balancing
    use_amd = prefer_amd or PREFER_AMD_GPU or (random.random() < 0.5)

    if task_type == "evil":
        # Evil/uncensored models
        if use_amd:
            return globals.LLAMA_AMD_URL, "darkidol-amd"
        else:
            return globals.LLAMA_URL, "darkidol"

    elif task_type == "vision":
        # Vision models - MiniCPM on NVIDIA, Moondream on AMD
        if use_amd:
            return globals.LLAMA_AMD_URL, "moondream-amd"
        else:
            return globals.LLAMA_URL, "vision"

    else:
        # Text generation - split requests between GPUs (random 50/50 unless AMD is preferred)
        if use_amd:
            return globals.LLAMA_AMD_URL, "llama3.1-amd"
        else:
            return globals.LLAMA_URL, "llama3.1"


def get_vision_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
    """
    Get the appropriate vision model based on GPU preference.

    Args:
        prefer_amd: If True, use AMD GPU vision model

    Returns:
        Tuple of (url, model_name)
    """
    if prefer_amd and AMD_MODELS_ENABLED:
        return globals.LLAMA_AMD_URL, "moondream-amd"
    else:
        return globals.LLAMA_URL, "vision"


def get_text_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
    """
    Get the appropriate text model based on GPU preference.

    Args:
        prefer_amd: If True, use AMD GPU text model

    Returns:
        Tuple of (url, model_name)
    """
    if prefer_amd and AMD_MODELS_ENABLED:
        return globals.LLAMA_AMD_URL, "llama3.1-amd"
    else:
        return globals.LLAMA_URL, "llama3.1"


def log_gpu_routing(model_name: str, endpoint: str, task_type: str = "inference"):
    """
    Log GPU routing decision for debugging.

    Args:
        model_name: Name of the model being used
        endpoint: URL endpoint being used
        task_type: Type of task being performed
    """
    gpu_type = "AMD RX 6800" if endpoint == globals.LLAMA_AMD_URL else "NVIDIA"
    logger.info(f"[GPU Router] {task_type} - Using {model_name} on {gpu_type} ({endpoint})")


# Example usage in bot code:
"""
# Simple routing by model name
url = get_endpoint_for_model("llama3.1-amd")

# Load balanced routing
url, model = get_llama_url_with_load_balancing(task_type="text")
response = requests.post(
    f"{url}/v1/chat/completions",
    json={"model": model, ...}
)

# Vision model with GPU preference
url, model = get_vision_model_for_gpu(prefer_amd=True)

# With logging
url = get_endpoint_for_model("darkidol-amd")
log_gpu_routing("darkidol-amd", url, "evil mode generation")
"""
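The persisted selection in memory/gpu_state.json is what the commit message says bot/utils/llm.py's get_current_gpu_url() consults when routing text models. Below is a minimal sketch of how such a lookup could work, not the implementation from this commit; the state file schema ({"selected_gpu": "amd"} or {"selected_gpu": "nvidia"}) and the relative path are assumptions.

import json
import os

import globals

GPU_STATE_PATH = os.path.join("memory", "gpu_state.json")  # file named in the commit; relative location assumed


def get_current_gpu_url() -> str:
    """Return the llama-swap endpoint for the currently selected GPU (illustrative sketch)."""
    try:
        with open(GPU_STATE_PATH, "r", encoding="utf-8") as f:
            state = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return globals.LLAMA_URL  # default to the NVIDIA endpoint (port 8090)
    if state.get("selected_gpu") == "amd":
        return globals.LLAMA_AMD_URL  # AMD container endpoint (port 8091)
    return globals.LLAMA_URL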