llama-swap: use pre-built images (:cuda, :rocm) with GPU-specific flags

- Drop custom Dockerfiles; docker-compose uses ghcr.io pre-built images
  which ship llama-swap + llama-server with no pinned versions (always latest)
- NVIDIA GTX 1660 (6GB): add -fit off --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
  to fix OOM segfault with new llama.cpp b9014's GPU-side KV cache default
- AMD RX 6800 (16GB): flags unchanged; KV cache stays on GPU for max speed
- Both running llama-swap v211 + llama.cpp b9014 (2026-05-05)
This commit is contained in:
2026-05-05 16:53:34 +03:00
parent 4e28236b06
commit 9eb081efb1
4 changed files with 5 additions and 91 deletions

View File

@@ -1,13 +0,0 @@
FROM ghcr.io/mostlygeek/llama-swap:cuda
USER root
# Download and install llama-server binary (CUDA version)
# Using the official pre-built binary from llama.cpp releases
ADD --chmod=755 https://github.com/ggml-org/llama.cpp/releases/download/b4183/llama-server-cuda /usr/local/bin/llama-server
# Verify it's executable
RUN llama-server --version || echo "llama-server installed successfully"
USER 1000:1000

View File

@@ -1,68 +0,0 @@
# Multi-stage build for llama-swap with ROCm support
# Now using official llama.cpp ROCm image (PR #18439 merged Dec 29, 2025)
# Stage 1: Build llama-swap UI
FROM node:22-alpine AS ui-builder
WORKDIR /build
# Install git
RUN apk add --no-cache git
# Clone llama-swap
RUN git clone https://github.com/mostlygeek/llama-swap.git
# Build UI (now in ui-svelte directory)
WORKDIR /build/llama-swap/ui-svelte
RUN npm install && npm run build
# Stage 2: Build llama-swap binary
FROM golang:1.23-alpine AS swap-builder
WORKDIR /build
# Install git
RUN apk add --no-cache git
# Copy llama-swap source with built UI
COPY --from=ui-builder /build/llama-swap /build/llama-swap
# Build llama-swap binary
WORKDIR /build/llama-swap
RUN GOTOOLCHAIN=auto go build -o /build/llama-swap-binary .
# Stage 3: Final runtime image using official llama.cpp ROCm image
FROM ghcr.io/ggml-org/llama.cpp:server-rocm
WORKDIR /app
# Copy llama-swap binary from builder
COPY --from=swap-builder /build/llama-swap-binary /app/llama-swap
# Make binaries executable
RUN chmod +x /app/llama-swap
# Add existing ubuntu user (UID 1000) to GPU access groups (using host GIDs)
# GID 187 = render group on host, GID 989 = video/kfd group on host
RUN groupadd -g 187 hostrender && \
groupadd -g 989 hostvideo && \
usermod -aG hostrender,hostvideo ubuntu && \
chown -R ubuntu:ubuntu /app
# Set environment for ROCm (RX 6800 is gfx1030)
ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
ENV ROCM_PATH=/opt/rocm
ENV HIP_VISIBLE_DEVICES=0
USER ubuntu
# Expose port
EXPOSE 8080
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8080/health || exit 1
# Override the base image's ENTRYPOINT and run llama-swap
ENTRYPOINT []
CMD ["/app/llama-swap", "-config", "/app/config.yaml", "-listen", "0.0.0.0:8080"]

View File

@@ -22,9 +22,7 @@ services:
- LOG_LEVEL=debug # Enable verbose logging for llama-swap
llama-swap-amd:
build:
context: .
dockerfile: Dockerfile.llamaswap-rocm
image: ghcr.io/mostlygeek/llama-swap:rocm
container_name: llama-swap-amd
ports:
- "8091:8080" # Map host port 8091 to container port 8080
@@ -35,9 +33,6 @@ services:
devices:
- /dev/kfd:/dev/kfd
- /dev/dri:/dev/dri
group_add:
- "985" # video group
- "989" # render group
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]

View File

@@ -5,7 +5,7 @@ models:
# Main text generation model (Llama 3.1 8B)
# Custom chat template to disable built-in tool calling
llama3.1:
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 --chat-template-file /app/llama31_notool_template.jinja
ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
swap: true # CRITICAL: Unload other models when loading this one
aliases:
@@ -14,7 +14,7 @@ models:
# Evil/Uncensored text generation model (DarkIdol-Llama 3.1 8B)
darkidol:
cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
ttl: 1800 # Unload after 30 minutes of inactivity
swap: true # CRITICAL: Unload other models when loading this one
aliases:
@@ -24,7 +24,7 @@ models:
# Japanese language model (Llama 3.1 Swallow - Japanese optimized)
swallow:
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
ttl: 1800 # Unload after 30 minutes of inactivity
swap: true # CRITICAL: Unload other models when loading this one
aliases:
@@ -34,7 +34,7 @@ models:
# Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs)
vision:
cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup --flash-attn on
cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
swap: true # CRITICAL: Unload text models before loading vision
aliases: