From 9eb081efb13a74d697bdb3548228f53c65b934c5 Mon Sep 17 00:00:00 2001
From: koko210Serve
Date: Tue, 5 May 2026 16:53:34 +0300
Subject: [PATCH] llama-swap: use pre-built images (:cuda, :rocm) with
 GPU-specific flags

- Drop the custom Dockerfiles; docker-compose now uses the pre-built
  ghcr.io images, which ship llama-swap + llama-server with no pinned
  versions (always latest)
- NVIDIA GTX 1660 (6GB): add -fit off --no-kv-offload --cache-type-k q4_0
  --cache-type-v q4_0 to fix an OOM segfault caused by llama.cpp b9014's
  new GPU-side KV cache default
- AMD RX 6800 (16GB): flags unchanged; the KV cache stays on the GPU for
  maximum speed
- Both hosts run llama-swap v211 + llama.cpp b9014 (2026-05-05)
---
 Dockerfile.llamaswap      | 13 --------
 Dockerfile.llamaswap-rocm | 68 ---------------------------------------
 docker-compose.yml        |  7 +---
 llama-swap-config.yaml    |  8 ++---
 4 files changed, 5 insertions(+), 91 deletions(-)
 delete mode 100644 Dockerfile.llamaswap
 delete mode 100644 Dockerfile.llamaswap-rocm

diff --git a/Dockerfile.llamaswap b/Dockerfile.llamaswap
deleted file mode 100644
index 55ab7dc..0000000
--- a/Dockerfile.llamaswap
+++ /dev/null
@@ -1,13 +0,0 @@
-FROM ghcr.io/mostlygeek/llama-swap:cuda
-
-USER root
-
-# Download and install llama-server binary (CUDA version)
-# Using the official pre-built binary from llama.cpp releases
-ADD --chmod=755 https://github.com/ggml-org/llama.cpp/releases/download/b4183/llama-server-cuda /usr/local/bin/llama-server
-
-# Verify it's executable
-RUN llama-server --version || echo "llama-server installed successfully"
-
-USER 1000:1000
-
diff --git a/Dockerfile.llamaswap-rocm b/Dockerfile.llamaswap-rocm
deleted file mode 100644
index dd092a3..0000000
--- a/Dockerfile.llamaswap-rocm
+++ /dev/null
@@ -1,68 +0,0 @@
-# Multi-stage build for llama-swap with ROCm support
-# Now using official llama.cpp ROCm image (PR #18439 merged Dec 29, 2025)
-
-# Stage 1: Build llama-swap UI
-FROM node:22-alpine AS ui-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Clone llama-swap
-RUN git clone https://github.com/mostlygeek/llama-swap.git
-
-# Build UI (now in ui-svelte directory)
-WORKDIR /build/llama-swap/ui-svelte
-RUN npm install && npm run build
-
-# Stage 2: Build llama-swap binary
-FROM golang:1.23-alpine AS swap-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Copy llama-swap source with built UI
-COPY --from=ui-builder /build/llama-swap /build/llama-swap
-
-# Build llama-swap binary
-WORKDIR /build/llama-swap
-RUN GOTOOLCHAIN=auto go build -o /build/llama-swap-binary .
-
-# Stage 3: Final runtime image using official llama.cpp ROCm image
-FROM ghcr.io/ggml-org/llama.cpp:server-rocm
-
-WORKDIR /app
-
-# Copy llama-swap binary from builder
-COPY --from=swap-builder /build/llama-swap-binary /app/llama-swap
-
-# Make binaries executable
-RUN chmod +x /app/llama-swap
-
-# Add existing ubuntu user (UID 1000) to GPU access groups (using host GIDs)
-# GID 187 = render group on host, GID 989 = video/kfd group on host
-RUN groupadd -g 187 hostrender && \
-    groupadd -g 989 hostvideo && \
-    usermod -aG hostrender,hostvideo ubuntu && \
-    chown -R ubuntu:ubuntu /app
-
-# Set environment for ROCm (RX 6800 is gfx1030)
-ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
-ENV ROCM_PATH=/opt/rocm
-ENV HIP_VISIBLE_DEVICES=0
-
-USER ubuntu
-
-# Expose port
-EXPOSE 8080
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
-    CMD curl -f http://localhost:8080/health || exit 1
-
-# Override the base image's ENTRYPOINT and run llama-swap
-ENTRYPOINT []
-CMD ["/app/llama-swap", "-config", "/app/config.yaml", "-listen", "0.0.0.0:8080"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index b91ee2d..5c71e06 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -22,9 +22,7 @@ services:
       - LOG_LEVEL=debug  # Enable verbose logging for llama-swap

   llama-swap-amd:
-    build:
-      context: .
-      dockerfile: Dockerfile.llamaswap-rocm
+    image: ghcr.io/mostlygeek/llama-swap:rocm
     container_name: llama-swap-amd
     ports:
       - "8091:8080"  # Map host port 8091 to container port 8080
@@ -35,9 +33,6 @@
     devices:
       - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
-    group_add:
-      - "985"  # video group
-      - "989"  # render group
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
diff --git a/llama-swap-config.yaml b/llama-swap-config.yaml
index 1e4b1d5..1ec18f7 100644
--- a/llama-swap-config.yaml
+++ b/llama-swap-config.yaml
@@ -5,7 +5,7 @@ models:
   # Main text generation model (Llama 3.1 8B)
   # Custom chat template to disable built-in tool calling
   llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 --chat-template-file /app/llama31_notool_template.jinja
     ttl: 1800  # Unload after 30 minutes of inactivity (1800 seconds)
     swap: true  # CRITICAL: Unload other models when loading this one
     aliases:
@@ -14,7 +14,7 @@ models:

   # Evil/Uncensored text generation model (DarkIdol-Llama 3.1 8B)
   darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 1800  # Unload after 30 minutes of inactivity
     swap: true  # CRITICAL: Unload other models when loading this one
     aliases:
@@ -24,7 +24,7 @@ models:

   # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
   swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 1800  # Unload after 30 minutes of inactivity
     swap: true  # CRITICAL: Unload other models when loading this one
     aliases:
@@ -34,7 +34,7 @@ models:

   # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs)
   vision:
-    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 900  # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
     swap: true  # CRITICAL: Unload text models before loading vision
     aliases: