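Removed the `-nkvo` (`--no-kv-offload`) flag from the llama-server commands, so the KV cache is offloaded to the GPU again instead of being kept in host memory, and enabled Flash Attention (`--flash-attn on`). Performance gains in the tens.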
This commit is contained in:
@@ -6,6 +6,7 @@ models:
|
|||||||
llama3.1:
|
llama3.1:
|
||||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
||||||
ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
|
ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
|
||||||
|
swap: true # CRITICAL: Unload other models when loading this one
|
||||||
aliases:
|
aliases:
|
||||||
- llama3.1
|
- llama3.1
|
||||||
- text-model
|
- text-model
|
||||||
@@ -14,6 +15,7 @@ models:
|
|||||||
darkidol:
|
darkidol:
|
||||||
cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
||||||
ttl: 1800 # Unload after 30 minutes of inactivity
|
ttl: 1800 # Unload after 30 minutes of inactivity
|
||||||
|
swap: true # CRITICAL: Unload other models when loading this one
|
||||||
aliases:
|
aliases:
|
||||||
- darkidol
|
- darkidol
|
||||||
- evil-model
|
- evil-model
|
||||||
@@ -23,6 +25,7 @@ models:
|
|||||||
swallow:
|
swallow:
|
||||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
||||||
ttl: 1800 # Unload after 30 minutes of inactivity
|
ttl: 1800 # Unload after 30 minutes of inactivity
|
||||||
|
swap: true # CRITICAL: Unload other models when loading this one
|
||||||
aliases:
|
aliases:
|
||||||
- swallow
|
- swallow
|
||||||
- japanese
|
- japanese
|
||||||
@@ -32,6 +35,7 @@ models:
|
|||||||
vision:
|
vision:
|
||||||
cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup
|
cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup
|
||||||
ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
|
ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
|
||||||
|
swap: true # CRITICAL: Unload text models before loading vision
|
||||||
aliases:
|
aliases:
|
||||||
- vision
|
- vision
|
||||||
- vision-model
|
- vision-model
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
models:
|
models:
|
||||||
# Main text generation model (same name as NVIDIA for uniform switching)
|
# Main text generation model (same name as NVIDIA for uniform switching)
|
||||||
llama3.1:
|
llama3.1:
|
||||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
|
||||||
ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
|
ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
|
||||||
aliases:
|
aliases:
|
||||||
- llama3.1
|
- llama3.1
|
||||||
@@ -13,7 +13,7 @@ models:
|
|||||||
|
|
||||||
# Evil/Uncensored model (same name as NVIDIA for uniform switching)
|
# Evil/Uncensored model (same name as NVIDIA for uniform switching)
|
||||||
darkidol:
|
darkidol:
|
||||||
cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
|
||||||
ttl: 1800 # Unload after 30 minutes of inactivity
|
ttl: 1800 # Unload after 30 minutes of inactivity
|
||||||
aliases:
|
aliases:
|
||||||
- darkidol
|
- darkidol
|
||||||
@@ -22,7 +22,7 @@ models:
|
|||||||
|
|
||||||
# Japanese language model (Llama 3.1 Swallow - Japanese optimized)
|
# Japanese language model (Llama 3.1 Swallow - Japanese optimized)
|
||||||
swallow:
|
swallow:
|
||||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
|
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
|
||||||
ttl: 1800 # Unload after 30 minutes of inactivity
|
ttl: 1800 # Unload after 30 minutes of inactivity
|
||||||
aliases:
|
aliases:
|
||||||
- swallow
|
- swallow
|
||||||
|
|||||||
Reference in New Issue
Block a user