From c0aaab0c3a2b0f7b817e376c66aff6fd9b08fd4f Mon Sep 17 00:00:00 2001
From: koko210Serve
Date: Tue, 27 Jan 2026 19:11:49 +0200
Subject: [PATCH] Re-enabled KV cache offloading (dropped -nkvo) and enabled
 Flash Attention on the ROCm llama-server config; marked all models
 swap: true in the NVIDIA config. Performance gains in the tens.

---
 llama-swap-config.yaml      | 4 ++++
 llama-swap-rocm-config.yaml | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/llama-swap-config.yaml b/llama-swap-config.yaml
index e445f7f..2ea9a38 100644
--- a/llama-swap-config.yaml
+++ b/llama-swap-config.yaml
@@ -6,6 +6,7 @@ models:
   llama3.1:
     cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
     ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
+    swap: true # CRITICAL: Unload other models when loading this one
     aliases:
       - llama3.1
       - text-model
@@ -14,6 +15,7 @@ models:
   darkidol:
     cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
     ttl: 1800 # Unload after 30 minutes of inactivity
+    swap: true # CRITICAL: Unload other models when loading this one
     aliases:
       - darkidol
       - evil-model
@@ -23,6 +25,7 @@ models:
   swallow:
     cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
     ttl: 1800 # Unload after 30 minutes of inactivity
+    swap: true # CRITICAL: Unload other models when loading this one
     aliases:
       - swallow
       - japanese
@@ -32,6 +35,7 @@ models:
   vision:
     cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup
     ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
+    swap: true # CRITICAL: Unload text models before loading vision
     aliases:
       - vision
       - vision-model
diff --git a/llama-swap-rocm-config.yaml b/llama-swap-rocm-config.yaml
index 885b21e..5327531 100644
--- a/llama-swap-rocm-config.yaml
+++ b/llama-swap-rocm-config.yaml
@@ -5,7 +5,7 @@ models:
 
   # Main text generation model (same name as NVIDIA for uniform switching)
   llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
     ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
     aliases:
       - llama3.1
@@ -13,7 +13,7 @@ models:
   # Evil/Uncensored model (same name as NVIDIA for uniform switching)
   darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
     ttl: 1800 # Unload after 30 minutes of inactivity
     aliases:
       - darkidol
       - evil-model
@@ -22,7 +22,7 @@ models:
 
   # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
   swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
     ttl: 1800 # Unload after 30 minutes of inactivity
     aliases:
       - swallow
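
Reviewer note: a minimal sketch of what the post-patch ROCm "llama3.1" entry
evaluates to, for reading the hunks above in one piece. The cmd/ttl/aliases
keys come straight from the diff; the trailing curl is an assumed smoke test
against llama-swap's OpenAI-compatible endpoint (hostname and port are
hypothetical, adjust to your deployment). Also note that --flash-attn on
assumes a llama.cpp build recent enough for the flag to take an on/off/auto
argument; older builds expose -fa as a bare toggle.

    models:
      llama3.1:
        # -nkvo removed: the KV cache is offloaded to the GPU again, and
        # --flash-attn on enables the fused attention path.
        cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
        ttl: 1800
        aliases:
          - llama3.1

    # Smoke test: llama-swap loads the model on the first request, routed by
    # the "model" field (assumed llama-swap address, default port 8080).
    curl -s http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "llama3.1", "messages": [{"role": "user", "content": "hi"}]}'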