From c0aaab0c3a2b0f7b817e376c66aff6fd9b08fd4f Mon Sep 17 00:00:00 2001
From: koko210Serve
Date: Tue, 27 Jan 2026 19:11:49 +0200
Subject: [PATCH] Re-enabled KV cache offloading (dropped -nkvo) and enabled
 Flash Attention on the ROCm llama-server config; marked all models
 swap: true in the NVIDIA config. Performance gains in the tens.

---
 llama-swap-config.yaml      | 4 ++++
 llama-swap-rocm-config.yaml | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/llama-swap-config.yaml b/llama-swap-config.yaml
index e445f7f..2ea9a38 100644
--- a/llama-swap-config.yaml
+++ b/llama-swap-config.yaml
@@ -6,6 +6,7 @@ models:
   llama3.1:
     cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
     ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
+    swap: true # CRITICAL: Unload other models when loading this one
     aliases:
       - llama3.1
       - text-model
@@ -14,6 +15,7 @@ models:
   darkidol:
     cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
     ttl: 1800 # Unload after 30 minutes of inactivity
+    swap: true # CRITICAL: Unload other models when loading this one
     aliases:
       - darkidol
       - evil-model
@@ -23,6 +25,7 @@ models:
   swallow:
     cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
     ttl: 1800 # Unload after 30 minutes of inactivity
+    swap: true # CRITICAL: Unload other models when loading this one
     aliases:
       - swallow
       - japanese
@@ -32,6 +35,7 @@ models:
   vision:
     cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup
     ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
+    swap: true # CRITICAL: Unload text models before loading vision
     aliases:
       - vision
       - vision-model
diff --git a/llama-swap-rocm-config.yaml b/llama-swap-rocm-config.yaml
index 885b21e..5327531 100644
--- a/llama-swap-rocm-config.yaml
+++ b/llama-swap-rocm-config.yaml
@@ -5,7 +5,7 @@ models:
 
   # Main text generation model (same name as NVIDIA for uniform switching)
   llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
     ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
     aliases:
       - llama3.1
@@ -13,7 +13,7 @@ models:
   # Evil/Uncensored model (same name as NVIDIA for uniform switching)
   darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
     ttl: 1800 # Unload after 30 minutes of inactivity
     aliases:
       - darkidol
       - evil-model
@@ -22,7 +22,7 @@ models:
 
   # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
   swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
     ttl: 1800 # Unload after 30 minutes of inactivity
     aliases:
       - swallow
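
Reviewer note: a minimal sketch of what the post-patch ROCm "llama3.1" entry
evaluates to, for reading the hunks above in one piece. The cmd/ttl/aliases
keys come straight from the diff; the trailing curl is an assumed smoke test
against llama-swap's OpenAI-compatible endpoint (hostname and port are
hypothetical, adjust to your deployment). Also note that --flash-attn on
assumes a llama.cpp build recent enough for the flag to take an on/off/auto
argument; older builds expose -fa as a bare toggle.

    models:
      llama3.1:
        # -nkvo removed: the KV cache is offloaded to the GPU again, and
        # --flash-attn on enables the fused attention path.
        cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
        ttl: 1800
        aliases:
          - llama3.1

    # Smoke test: llama-swap loads the model on the first request, routed by
    # the "model" field (assumed llama-swap address, default port 8080).
    curl -s http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "llama3.1", "messages": [{"role": "user", "content": "hi"}]}'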