Disable model warmup to improve switching speed
- Added --no-warmup flag to both the llama3.1 and vision models
- Reduces model switch time by 2-5 seconds per swap (rough timing check sketched below)
- No impact on response quality; only a minor first-token latency cost
- Better suited to a frequent-model-switching use case and a tight VRAM budget
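The 2-5 second figure is the warmup pass llama-server normally runs before reporting ready; with --no-warmup the first real request pays a small page-in cost instead. A rough end-to-end check of the swap latency, assuming llama-swap is proxying an OpenAI-compatible API on localhost:8080 (an assumed listen address, adjust to your deployment) and the model names match the aliases in the config below:

# Load the vision model first, then time a swap back to llama3.1.
# max_tokens=1 keeps generation time out of the measurement.
curl -s http://localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model":"vision","messages":[{"role":"user","content":"hi"}],"max_tokens":1}' > /dev/null
time curl -s http://localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model":"llama3.1","messages":[{"role":"user","content":"hi"}],"max_tokens":1}' > /dev/null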
@@ -4,7 +4,7 @@
 models:
   # Main text generation model (Llama 3.1 8B)
   llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup
     ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
     aliases:
       - llama3.1
@@ -12,7 +12,7 @@ models:
 
   # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs)
   vision:
-    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0
+    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup
     ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
     aliases:
       - vision
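One caveat: --no-warmup is a llama.cpp flag, so if the image pins an older llama-server build that predates it, the spawned server would exit on the unknown argument and llama-swap would fail to bring the model up. A quick sanity check inside the container, using the same /app/llama-server path as the cmd lines above:

# Verify the bundled build actually recognizes the flag before deploying.
/app/llama-server --help | grep -- --no-warmup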