llama-swap: use pre-built images (:cuda, :rocm) with GPU-specific flags

- Drop custom Dockerfiles; docker-compose now uses the pre-built ghcr.io images, which ship llama-swap + llama-server with no pinned versions (always latest).
- NVIDIA GTX 1660 (6GB): add -fit off --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 to fix the OOM segfault caused by the GPU-side KV cache default in the new llama.cpp b9014.
- AMD RX 6800 (16GB): flags unchanged; the KV cache stays on the GPU for maximum speed.
- Both are running llama-swap v211 + llama.cpp b9014 (2026-05-05).
2026-05-05 16:53:34 +03:00
parent 4e28236b06
commit 9eb081efb1
4 changed files with 5 additions and 91 deletions
--- a/Dockerfile.llamaswap
+++ b/Dockerfile.llamaswap
@@ -1,13 +0,0 @@
-FROM ghcr.io/mostlygeek/llama-swap:cuda
-
-USER root
-
-# Download and install llama-server binary (CUDA version)
-# Using the official pre-built binary from llama.cpp releases
-ADD --chmod=755 https://github.com/ggml-org/llama.cpp/releases/download/b4183/llama-server-cuda /usr/local/bin/llama-server
-
-# Verify it's executable
-RUN llama-server --version || echo "llama-server installed successfully"
-
-USER 1000:1000
-
--- a/Dockerfile.llamaswap-rocm
+++ b/Dockerfile.llamaswap-rocm
@@ -1,68 +0,0 @@
-# Multi-stage build for llama-swap with ROCm support
-# Now using official llama.cpp ROCm image (PR #18439 merged Dec 29, 2025)
-
-# Stage 1: Build llama-swap UI
-FROM node:22-alpine AS ui-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Clone llama-swap
-RUN git clone https://github.com/mostlygeek/llama-swap.git
-
-# Build UI (now in ui-svelte directory)
-WORKDIR /build/llama-swap/ui-svelte
-RUN npm install && npm run build
-
-# Stage 2: Build llama-swap binary
-FROM golang:1.23-alpine AS swap-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Copy llama-swap source with built UI
-COPY --from=ui-builder /build/llama-swap /build/llama-swap
-
-# Build llama-swap binary
-WORKDIR /build/llama-swap
-RUN GOTOOLCHAIN=auto go build -o /build/llama-swap-binary .
-
-# Stage 3: Final runtime image using official llama.cpp ROCm image
-FROM ghcr.io/ggml-org/llama.cpp:server-rocm
-
-WORKDIR /app
-
-# Copy llama-swap binary from builder
-COPY --from=swap-builder /build/llama-swap-binary /app/llama-swap
-
-    # Make binaries executable
-    RUN chmod +x /app/llama-swap
-    
-    # Add existing ubuntu user (UID 1000) to GPU access groups (using host GIDs)
-    # GID 187 = render group on host, GID 989 = video/kfd group on host
-    RUN groupadd -g 187 hostrender && \
-        groupadd -g 989 hostvideo && \
-        usermod -aG hostrender,hostvideo ubuntu && \
-        chown -R ubuntu:ubuntu /app
-    
-    # Set environment for ROCm (RX 6800 is gfx1030)
-    ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
-    ENV ROCM_PATH=/opt/rocm
-    ENV HIP_VISIBLE_DEVICES=0
-    
-    USER ubuntu
-    
-    # Expose port
-    EXPOSE 8080
-    
-    # Health check
-    HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
-      CMD curl -f http://localhost:8080/health || exit 1
-    
-    # Override the base image's ENTRYPOINT and run llama-swap
-    ENTRYPOINT []
-    CMD ["/app/llama-swap", "-config", "/app/config.yaml", "-listen", "0.0.0.0:8080"]
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -22,9 +22,7 @@ services:
      - LOG_LEVEL=debug  # Enable verbose logging for llama-swap

  llama-swap-amd:
-    build:
-      context: .
-      dockerfile: Dockerfile.llamaswap-rocm
+    image: ghcr.io/mostlygeek/llama-swap:rocm
    container_name: llama-swap-amd
    ports:
      - "8091:8080"  # Map host port 8091 to container port 8080
@@ -35,9 +33,6 @@ services:
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
-    group_add:
-      - "985"  # video group
-      - "989"  # render group
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
--- a/llama-swap-config.yaml
+++ b/llama-swap-config.yaml
@@ -5,7 +5,7 @@ models:
  # Main text generation model (Llama 3.1 8B)
  # Custom chat template to disable built-in tool calling
  llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 --chat-template-file /app/llama31_notool_template.jinja
    ttl: 1800  # Unload after 30 minutes of inactivity (1800 seconds)
    swap: true  # CRITICAL: Unload other models when loading this one
    aliases:
@@ -14,7 +14,7 @@ models:
  
  # Evil/Uncensored text generation model (DarkIdol-Llama 3.1 8B)
  darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 1800  # Unload after 30 minutes of inactivity
    swap: true  # CRITICAL: Unload other models when loading this one
    aliases:
@@ -24,7 +24,7 @@ models:
  
  # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
  swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 1800  # Unload after 30 minutes of inactivity
    swap: true  # CRITICAL: Unload other models when loading this one
    aliases:
@@ -34,7 +34,7 @@ models:
    
  # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs)
  vision:
-    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 900  # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
    swap: true  # CRITICAL: Unload text models before loading vision
    aliases: