From 675bb21653faa830713c43ae72ae7c8bf7cb6746 Mon Sep 17 00:00:00 2001 From: koko210Serve Date: Wed, 10 Dec 2025 10:09:37 +0200 Subject: [PATCH] Disable model warmup to improve switching speed - Added --no-warmup flag to both llama3.1 and vision models - Reduces model switch time by 2-5 seconds per swap - No impact on response quality, only minor first-token latency - Better for frequent model switching use case and tight VRAM budget --- llama-swap-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama-swap-config.yaml b/llama-swap-config.yaml index 00415a1..0134e85 100644 --- a/llama-swap-config.yaml +++ b/llama-swap-config.yaml @@ -4,7 +4,7 @@ models: # Main text generation model (Llama 3.1 8B) llama3.1: - cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 + cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds) aliases: - llama3.1 @@ -12,7 +12,7 @@ models: # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs) vision: - cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 + cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds) aliases: - vision