From 9eb081efb13a74d697bdb3548228f53c65b934c5 Mon Sep 17 00:00:00 2001
From: koko210Serve
Date: Tue, 5 May 2026 16:53:34 +0300
Subject: [PATCH] llama-swap: use pre-built images (:cuda, :rocm) with
 GPU-specific flags

- Drop the custom Dockerfiles; docker-compose now uses the pre-built
  ghcr.io images, which ship llama-swap + llama-server with no pinned
  versions (always latest)
- NVIDIA GTX 1660 (6GB): add -fit off --no-kv-offload --cache-type-k q4_0
  --cache-type-v q4_0 to fix an OOM segfault caused by llama.cpp b9014's
  new GPU-side KV cache default
- AMD RX 6800 (16GB): flags unchanged; the KV cache stays on the GPU for
  maximum speed
- Both hosts run llama-swap v211 + llama.cpp b9014 (2026-05-05)
---
 Dockerfile.llamaswap      | 13 --------
 Dockerfile.llamaswap-rocm | 68 ---------------------------------------
 docker-compose.yml        |  7 +---
 llama-swap-config.yaml    |  8 ++---
 4 files changed, 5 insertions(+), 91 deletions(-)
 delete mode 100644 Dockerfile.llamaswap
 delete mode 100644 Dockerfile.llamaswap-rocm

diff --git a/Dockerfile.llamaswap b/Dockerfile.llamaswap
deleted file mode 100644
index 55ab7dc..0000000
--- a/Dockerfile.llamaswap
+++ /dev/null
@@ -1,13 +0,0 @@
-FROM ghcr.io/mostlygeek/llama-swap:cuda
-
-USER root
-
-# Download and install llama-server binary (CUDA version)
-# Using the official pre-built binary from llama.cpp releases
-ADD --chmod=755 https://github.com/ggml-org/llama.cpp/releases/download/b4183/llama-server-cuda /usr/local/bin/llama-server
-
-# Verify it's executable
-RUN llama-server --version || echo "llama-server installed successfully"
-
-USER 1000:1000
-
diff --git a/Dockerfile.llamaswap-rocm b/Dockerfile.llamaswap-rocm
deleted file mode 100644
index dd092a3..0000000
--- a/Dockerfile.llamaswap-rocm
+++ /dev/null
@@ -1,68 +0,0 @@
-# Multi-stage build for llama-swap with ROCm support
-# Now using official llama.cpp ROCm image (PR #18439 merged Dec 29, 2025)
-
-# Stage 1: Build llama-swap UI
-FROM node:22-alpine AS ui-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Clone llama-swap
-RUN git clone https://github.com/mostlygeek/llama-swap.git
-
-# Build UI (now in ui-svelte directory)
-WORKDIR /build/llama-swap/ui-svelte
-RUN npm install && npm run build
-
-# Stage 2: Build llama-swap binary
-FROM golang:1.23-alpine AS swap-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Copy llama-swap source with built UI
-COPY --from=ui-builder /build/llama-swap /build/llama-swap
-
-# Build llama-swap binary
-WORKDIR /build/llama-swap
-RUN GOTOOLCHAIN=auto go build -o /build/llama-swap-binary .
-
-# Stage 3: Final runtime image using official llama.cpp ROCm image
-FROM ghcr.io/ggml-org/llama.cpp:server-rocm
-
-WORKDIR /app
-
-# Copy llama-swap binary from builder
-COPY --from=swap-builder /build/llama-swap-binary /app/llama-swap
-
-# Make binaries executable
-RUN chmod +x /app/llama-swap
-
-# Add existing ubuntu user (UID 1000) to GPU access groups (using host GIDs)
-# GID 187 = render group on host, GID 989 = video/kfd group on host
-RUN groupadd -g 187 hostrender && \
-    groupadd -g 989 hostvideo && \
-    usermod -aG hostrender,hostvideo ubuntu && \
-    chown -R ubuntu:ubuntu /app
-
-# Set environment for ROCm (RX 6800 is gfx1030)
-ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
-ENV ROCM_PATH=/opt/rocm
-ENV HIP_VISIBLE_DEVICES=0
-
-USER ubuntu
-
-# Expose port
-EXPOSE 8080
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
-    CMD curl -f http://localhost:8080/health || exit 1
-
-# Override the base image's ENTRYPOINT and run llama-swap
-ENTRYPOINT []
-CMD ["/app/llama-swap", "-config", "/app/config.yaml", "-listen", "0.0.0.0:8080"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index b91ee2d..5c71e06 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -22,9 +22,7 @@ services:
       - LOG_LEVEL=debug  # Enable verbose logging for llama-swap

   llama-swap-amd:
-    build:
-      context: .
-      dockerfile: Dockerfile.llamaswap-rocm
+    image: ghcr.io/mostlygeek/llama-swap:rocm
     container_name: llama-swap-amd
     ports:
       - "8091:8080"  # Map host port 8091 to container port 8080
@@ -35,9 +33,6 @@
     devices:
       - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
-    group_add:
-      - "985"  # video group
-      - "989"  # render group
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
diff --git a/llama-swap-config.yaml b/llama-swap-config.yaml
index 1e4b1d5..1ec18f7 100644
--- a/llama-swap-config.yaml
+++ b/llama-swap-config.yaml
@@ -5,7 +5,7 @@ models:
   # Main text generation model (Llama 3.1 8B)
   # Custom chat template to disable built-in tool calling
   llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 --chat-template-file /app/llama31_notool_template.jinja
     ttl: 1800  # Unload after 30 minutes of inactivity (1800 seconds)
     swap: true  # CRITICAL: Unload other models when loading this one
     aliases:
@@ -14,7 +14,7 @@ models:

   # Evil/Uncensored text generation model (DarkIdol-Llama 3.1 8B)
   darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 1800  # Unload after 30 minutes of inactivity
     swap: true  # CRITICAL: Unload other models when loading this one
     aliases:
@@ -24,7 +24,7 @@ models:

   # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
   swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 1800  # Unload after 30 minutes of inactivity
     swap: true  # CRITICAL: Unload other models when loading this one
     aliases:
@@ -34,7 +34,7 @@ models:

   # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs)
   vision:
-    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 900  # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
     swap: true  # CRITICAL: Unload text models before loading vision
     aliases: