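Removed the `-nkvo` (`--no-kv-offload`) flag from the llama-server commands in the ROCm config, re-enabling KV cache offloading to the GPU, and enabled Flash Attention (`--flash-attn on`). Performance gains in the tens.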

2026-01-27 19:11:49 +02:00
parent dca58328e4
commit c0aaab0c3a
2 changed files with 7 additions and 3 deletions
--- a/llama-swap-config.yaml
+++ b/llama-swap-config.yaml
@@ -6,6 +6,7 @@ models:
  llama3.1:
    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
    ttl: 1800  # Unload after 30 minutes of inactivity (1800 seconds)
    swap: true  # CRITICAL: Unload other models when loading this one
    aliases:
      - llama3.1
      - text-model
@@ -14,6 +15,7 @@ models:
  darkidol:
    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
    ttl: 1800  # Unload after 30 minutes of inactivity
    swap: true  # CRITICAL: Unload other models when loading this one
    aliases:
      - darkidol
      - evil-model
@@ -23,6 +25,7 @@ models:
  swallow:
    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
    ttl: 1800  # Unload after 30 minutes of inactivity
    swap: true  # CRITICAL: Unload other models when loading this one
    aliases:
      - swallow
      - japanese
@@ -32,6 +35,7 @@ models:
  vision:
    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup
    ttl: 900  # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
    swap: true  # CRITICAL: Unload text models before loading vision
    aliases:
      - vision
      - vision-model
--- a/llama-swap-rocm-config.yaml
+++ b/llama-swap-rocm-config.yaml
@@ -5,7 +5,7 @@
 models:
  # Main text generation model (same name as NVIDIA for uniform switching)
  llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
    ttl: 1800  # Unload after 30 minutes of inactivity (1800 seconds)
    aliases:
      - llama3.1
@@ -13,7 +13,7 @@ models:
  # Evil/Uncensored model (same name as NVIDIA for uniform switching)
  darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
    ttl: 1800  # Unload after 30 minutes of inactivity
    aliases:
      - darkidol
@@ -22,7 +22,7 @@ models:
  # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
  swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
    ttl: 1800  # Unload after 30 minutes of inactivity
    aliases:
      - swallow