llama-swap: use pre-built images (:cuda, :rocm) with GPU-specific flags
- Drop custom Dockerfiles; docker-compose now uses the ghcr.io pre-built images, which ship llama-swap + llama-server with no pinned versions (always latest)
- NVIDIA GTX 1660 (6GB): add -fit off --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 to fix an OOM segfault under llama.cpp b9014's new GPU-side KV cache default (sizing arithmetic sketched below)
- AMD RX 6800 (16GB): flags unchanged; KV cache stays on GPU for max speed
- Both running llama-swap v211 + llama.cpp b9014 (2026-05-05)
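Why the q4_0 KV-cache flags rescue a 6 GB card: a back-of-envelope sizing sketch in Python. The architecture constants (32 layers, 8 KV heads, head dim 128) are Llama 3.1 8B's published values, and 4.5 bits/value for q4_0 follows from its 18-byte blocks of 32 values; both are assumptions brought in here, not values taken from this repo.

# Rough KV-cache sizing for Llama 3.1 8B at the 16K context set in config.yaml.
# Constants are the model's published architecture, assumed rather than read
# from this repo.
n_layers, n_kv_heads, head_dim, n_ctx = 32, 8, 128, 16384

def kv_cache_gib(bits_per_value: float) -> float:
    # K and V tensors together hold 2 * layers * ctx * kv_heads * head_dim values.
    values = 2 * n_layers * n_ctx * n_kv_heads * head_dim
    return values * bits_per_value / 8 / 2**30

print(f"f16  KV cache: {kv_cache_gib(16):.2f} GiB")   # ~2.00 GiB; plus ~5 GB of Q4_K_XL weights -> over 6 GB
print(f"q4_0 KV cache: {kv_cache_gib(4.5):.2f} GiB")  # ~0.56 GiB; --no-kv-offload keeps even this in system RAM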
@@ -1,13 +0,0 @@
-FROM ghcr.io/mostlygeek/llama-swap:cuda
-
-USER root
-
-# Download and install llama-server binary (CUDA version)
-# Using the official pre-built binary from llama.cpp releases
-ADD --chmod=755 https://github.com/ggml-org/llama.cpp/releases/download/b4183/llama-server-cuda /usr/local/bin/llama-server
-
-# Verify it's executable
-RUN llama-server --version || echo "llama-server installed successfully"
-
-USER 1000:1000
-
@@ -1,68 +0,0 @@
-# Multi-stage build for llama-swap with ROCm support
-# Now using official llama.cpp ROCm image (PR #18439 merged Dec 29, 2025)
-
-# Stage 1: Build llama-swap UI
-FROM node:22-alpine AS ui-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Clone llama-swap
-RUN git clone https://github.com/mostlygeek/llama-swap.git
-
-# Build UI (now in ui-svelte directory)
-WORKDIR /build/llama-swap/ui-svelte
-RUN npm install && npm run build
-
-# Stage 2: Build llama-swap binary
-FROM golang:1.23-alpine AS swap-builder
-
-WORKDIR /build
-
-# Install git
-RUN apk add --no-cache git
-
-# Copy llama-swap source with built UI
-COPY --from=ui-builder /build/llama-swap /build/llama-swap
-
-# Build llama-swap binary
-WORKDIR /build/llama-swap
-RUN GOTOOLCHAIN=auto go build -o /build/llama-swap-binary .
-
-# Stage 3: Final runtime image using official llama.cpp ROCm image
-FROM ghcr.io/ggml-org/llama.cpp:server-rocm
-
-WORKDIR /app
-
-# Copy llama-swap binary from builder
-COPY --from=swap-builder /build/llama-swap-binary /app/llama-swap
-
-# Make binaries executable
-RUN chmod +x /app/llama-swap
-
-# Add existing ubuntu user (UID 1000) to GPU access groups (using host GIDs)
-# GID 187 = render group on host, GID 989 = video/kfd group on host
-RUN groupadd -g 187 hostrender && \
-    groupadd -g 989 hostvideo && \
-    usermod -aG hostrender,hostvideo ubuntu && \
-    chown -R ubuntu:ubuntu /app
-
-# Set environment for ROCm (RX 6800 is gfx1030)
-ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
-ENV ROCM_PATH=/opt/rocm
-ENV HIP_VISIBLE_DEVICES=0
-
-USER ubuntu
-
-# Expose port
-EXPOSE 8080
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
-    CMD curl -f http://localhost:8080/health || exit 1
-
-# Override the base image's ENTRYPOINT and run llama-swap
-ENTRYPOINT []
-CMD ["/app/llama-swap", "-config", "/app/config.yaml", "-listen", "0.0.0.0:8080"]
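With both custom Dockerfiles gone, standing up the runtimes reduces to pulling the upstream images (the :cuda tag comes from the commit message, :rocm from the compose change below):

docker pull ghcr.io/mostlygeek/llama-swap:cuda
docker pull ghcr.io/mostlygeek/llama-swap:rocm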
@@ -22,9 +22,7 @@ services:
      - LOG_LEVEL=debug # Enable verbose logging for llama-swap

  llama-swap-amd:
-    build:
-      context: .
-      dockerfile: Dockerfile.llamaswap-rocm
+    image: ghcr.io/mostlygeek/llama-swap:rocm
    container_name: llama-swap-amd
    ports:
      - "8091:8080" # Map host port 8091 to container port 8080
@@ -35,9 +33,6 @@ services:
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
-    group_add:
-      - "985" # video group
-      - "989" # render group
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
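Only the AMD service appears in these hunks; the NVIDIA service presumably gets the same build-to-image swap with the :cuda tag. A sketch of what that block could look like; the service name, host port, and GPU reservation stanza are assumptions, not part of this diff:

  llama-swap-nvidia:                    # assumed service name
    image: ghcr.io/mostlygeek/llama-swap:cuda
    ports:
      - "8090:8080"                     # assumed host port
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia            # standard Compose syntax for NVIDIA GPU access
              count: all
              capabilities: [gpu]
    restart: unless-stopped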
@@ -5,7 +5,7 @@ models:
  # Main text generation model (Llama 3.1 8B)
  # Custom chat template to disable built-in tool calling
  llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 --chat-template-file /app/llama31_notool_template.jinja
    ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
    swap: true # CRITICAL: Unload other models when loading this one
    aliases:
@@ -14,7 +14,7 @@ models:

  # Evil/Uncensored text generation model (DarkIdol-Llama 3.1 8B)
  darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 1800 # Unload after 30 minutes of inactivity
    swap: true # CRITICAL: Unload other models when loading this one
    aliases:
@@ -24,7 +24,7 @@ models:

  # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
  swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 1800 # Unload after 30 minutes of inactivity
    swap: true # CRITICAL: Unload other models when loading this one
    aliases:
@@ -34,7 +34,7 @@ models:

  # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs)
  vision:
-    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
    ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
    swap: true # CRITICAL: Unload text models before loading vision
    aliases:
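As a smoke test for the new flags, a request through llama-swap's OpenAI-compatible endpoint will load (and, with swap: true, swap in) the named model on demand. Port 8091 is the AMD service's host port from the compose file; any alias from config.yaml also works in the model field:

curl -s http://localhost:8091/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama3.1", "messages": [{"role": "user", "content": "Hello"}]}'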