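Removed the `-nkvo` (`--no-kv-offload`) flag from the llama-server commands, so the KV cache is no longer held in host memory and is offloaded to the GPU again, and enabled Flash Attention (`--flash-attn on`). Performance gains in the tens.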

This commit is contained in:
2026-01-27 19:11:49 +02:00
parent dca58328e4
commit c0aaab0c3a
2 changed files with 7 additions and 3 deletions

View File

@@ -5,7 +5,7 @@
models:
# Main text generation model (same name as NVIDIA for uniform switching)
llama3.1:
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
aliases:
- llama3.1
@@ -13,7 +13,7 @@ models:
# Evil/Uncensored model (same name as NVIDIA for uniform switching)
darkidol:
cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
ttl: 1800 # Unload after 30 minutes of inactivity
aliases:
- darkidol
@@ -22,7 +22,7 @@ models:
# Japanese language model (Llama 3.1 Swallow - Japanese optimized)
swallow:
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
ttl: 1800 # Unload after 30 minutes of inactivity
aliases:
- swallow