llama-swap: use pre-built images (:cuda, :rocm) with GPU-specific flags
- Drop custom Dockerfiles; docker-compose now uses the ghcr.io pre-built images, which ship llama-swap + llama-server with no pinned versions (always latest)
- NVIDIA GTX 1660 (6GB): add -fit off --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 to fix an OOM segfault under llama.cpp b9014's new GPU-side KV cache default (sizing arithmetic sketched below)
- AMD RX 6800 (16GB): flags unchanged; KV cache stays on GPU for max speed
- Both running llama-swap v211 + llama.cpp b9014 (2026-05-05)
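Why the q4_0 KV-cache flags rescue a 6 GB card: a back-of-envelope sizing sketch in Python. The architecture constants (32 layers, 8 KV heads, head dim 128) are Llama 3.1 8B's published values, and 4.5 bits/value for q4_0 follows from its 18-byte blocks of 32 values; both are assumptions brought in here, not values taken from this repo.

# Rough KV-cache sizing for Llama 3.1 8B at the 16K context set in config.yaml.
# Constants are the model's published architecture, assumed rather than read
# from this repo.
n_layers, n_kv_heads, head_dim, n_ctx = 32, 8, 128, 16384

def kv_cache_gib(bits_per_value: float) -> float:
    # K and V tensors together hold 2 * layers * ctx * kv_heads * head_dim values.
    values = 2 * n_layers * n_ctx * n_kv_heads * head_dim
    return values * bits_per_value / 8 / 2**30

print(f"f16  KV cache: {kv_cache_gib(16):.2f} GiB")   # ~2.00 GiB; plus ~5 GB of Q4_K_XL weights -> over 6 GB
print(f"q4_0 KV cache: {kv_cache_gib(4.5):.2f} GiB")  # ~0.56 GiB; --no-kv-offload keeps even this in system RAM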
@@ -1,13 +0,0 @@
-FROM ghcr.io/mostlygeek/llama-swap:cuda
-
-USER root
-
-# Download and install llama-server binary (CUDA version)
-# Using the official pre-built binary from llama.cpp releases
-ADD --chmod=755 https://github.com/ggml-org/llama.cpp/releases/download/b4183/llama-server-cuda /usr/local/bin/llama-server
-
-# Verify it's executable
-RUN llama-server --version || echo "llama-server installed successfully"
-
-USER 1000:1000
-
@@ -1,68 +0,0 @@
-# Multi-stage build for llama-swap with ROCm support
-# Now using official llama.cpp ROCm image (PR #18439 merged Dec 29, 2025)
-
-# Stage 1: Build llama-swap UI
-FROM node:22-alpine AS ui-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Clone llama-swap
-RUN git clone https://github.com/mostlygeek/llama-swap.git
-
-# Build UI (now in ui-svelte directory)
-WORKDIR /build/llama-swap/ui-svelte
-RUN npm install && npm run build
-
-# Stage 2: Build llama-swap binary
-FROM golang:1.23-alpine AS swap-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Copy llama-swap source with built UI
-COPY --from=ui-builder /build/llama-swap /build/llama-swap
-
-# Build llama-swap binary
-WORKDIR /build/llama-swap
-RUN GOTOOLCHAIN=auto go build -o /build/llama-swap-binary .
-
-# Stage 3: Final runtime image using official llama.cpp ROCm image
-FROM ghcr.io/ggml-org/llama.cpp:server-rocm
-
-WORKDIR /app
-
-# Copy llama-swap binary from builder
-COPY --from=swap-builder /build/llama-swap-binary /app/llama-swap
-
-# Make binaries executable
-RUN chmod +x /app/llama-swap
-
-# Add existing ubuntu user (UID 1000) to GPU access groups (using host GIDs)
-# GID 187 = render group on host, GID 989 = video/kfd group on host
-RUN groupadd -g 187 hostrender && \
-    groupadd -g 989 hostvideo && \
-    usermod -aG hostrender,hostvideo ubuntu && \
-    chown -R ubuntu:ubuntu /app
-
-# Set environment for ROCm (RX 6800 is gfx1030)
-ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
-ENV ROCM_PATH=/opt/rocm
-ENV HIP_VISIBLE_DEVICES=0
-
-USER ubuntu
-
-# Expose port
-EXPOSE 8080
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
-    CMD curl -f http://localhost:8080/health || exit 1
-
-# Override the base image's ENTRYPOINT and run llama-swap
-ENTRYPOINT []
-CMD ["/app/llama-swap", "-config", "/app/config.yaml", "-listen", "0.0.0.0:8080"]
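With both custom Dockerfiles gone, standing up the runtimes reduces to pulling the upstream images (the :cuda tag comes from the commit message, :rocm from the compose change below):

docker pull ghcr.io/mostlygeek/llama-swap:cuda
docker pull ghcr.io/mostlygeek/llama-swap:rocm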
@@ -22,9 +22,7 @@ services:
      - LOG_LEVEL=debug # Enable verbose logging for llama-swap

  llama-swap-amd:
-    build:
-      context: .
-      dockerfile: Dockerfile.llamaswap-rocm
+    image: ghcr.io/mostlygeek/llama-swap:rocm
    container_name: llama-swap-amd
    ports:
      - "8091:8080" # Map host port 8091 to container port 8080
@@ -35,9 +33,6 @@ services:
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
-    group_add:
-      - "985" # video group
-      - "989" # render group
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
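Only the AMD service appears in these hunks; the NVIDIA service presumably gets the same build-to-image swap with the :cuda tag. A sketch of what that block could look like; the service name, host port, and GPU reservation stanza are assumptions, not part of this diff:

  llama-swap-nvidia:                    # assumed service name
    image: ghcr.io/mostlygeek/llama-swap:cuda
    ports:
      - "8090:8080"                     # assumed host port
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia            # standard Compose syntax for NVIDIA GPU access
              count: all
              capabilities: [gpu]
    restart: unless-stopped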
@@ -5,7 +5,7 @@ models:
  # Main text generation model (Llama 3.1 8B)
  # Custom chat template to disable built-in tool calling
  llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 --chat-template-file /app/llama31_notool_template.jinja
    ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
    swap: true # CRITICAL: Unload other models when loading this one
    aliases:
@@ -14,7 +14,7 @@ models:

  # Evil/Uncensored text generation model (DarkIdol-Llama 3.1 8B)
  darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 1800 # Unload after 30 minutes of inactivity
    swap: true # CRITICAL: Unload other models when loading this one
    aliases:
@@ -24,7 +24,7 @@ models:

  # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
  swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 1800 # Unload after 30 minutes of inactivity
    swap: true # CRITICAL: Unload other models when loading this one
    aliases:
@@ -34,7 +34,7 @@ models:

  # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs)
  vision:
-    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
    swap: true # CRITICAL: Unload text models before loading vision
    aliases:
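As a smoke test for the new flags, a request through llama-swap's OpenAI-compatible endpoint will load (and, with swap: true, swap in) the named model on demand. Port 8091 is the AMD service's host port from the compose file; any alias from config.yaml also works in the model field:

curl -s http://localhost:8091/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama3.1", "messages": [{"role": "user", "content": "Hello"}]}'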