llama-swap: use pre-built images (:cuda, :rocm) with GPU-specific flags

- Drop custom Dockerfiles; docker-compose uses ghcr.io pre-built images
  which ship llama-swap + llama-server with no pinned versions (always latest)
- NVIDIA GTX 1660 (6GB): add -fit off --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
  to fix an OOM segfault triggered by llama.cpp b9014's new GPU-side KV cache default
- AMD RX 6800 (16GB): flags unchanged; KV cache stays on GPU for max speed
- Both running llama-swap v211 + llama.cpp b9014 (2026-05-05)
This commit is contained in:
2026-05-05 16:53:34 +03:00
parent 4e28236b06
commit 9eb081efb1
4 changed files with 5 additions and 91 deletions
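The diff below shows only the AMD service. A matching NVIDIA service using the :cuda image, as described in the commit message, might look roughly like this sketch (service name, host port, and GPU reservation syntax are assumptions, not taken from the actual compose file):

```yaml
  llama-swap-nvidia:
    image: ghcr.io/mostlygeek/llama-swap:cuda   # pre-built CUDA image, floating tag
    container_name: llama-swap-nvidia
    ports:
      - "8090:8080"            # assumed host port; adjust to your setup
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia   # standard Compose GPU reservation
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
```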


@@ -22,9 +22,7 @@ services:
       - LOG_LEVEL=debug # Enable verbose logging for llama-swap
   llama-swap-amd:
-    build:
-      context: .
-      dockerfile: Dockerfile.llamaswap-rocm
+    image: ghcr.io/mostlygeek/llama-swap:rocm
     container_name: llama-swap-amd
     ports:
       - "8091:8080" # Map host port 8091 to container port 8080
@@ -35,9 +33,6 @@ services:
     devices:
       - /dev/kfd:/dev/kfd
       - /dev/dri:/dev/dri
-    group_add:
-      - "985" # video group
-      - "989" # render group
     restart: unless-stopped
     healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
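
The GTX 1660 flags from the commit message are passed to llama-server inside the llama-swap model config rather than in docker-compose. A minimal sketch of such an entry, assuming placeholder model names and paths (the actual config file is not part of this diff):

```yaml
# llama-swap config.yaml sketch; "example-model" and the paths are
# placeholders, only the flags come from the commit message.
models:
  "example-model":
    cmd: |
      /app/llama-server --port ${PORT}
        -m /models/example-model.gguf
        -fit off --no-kv-offload
        --cache-type-k q4_0 --cache-type-v q4_0
```

With --no-kv-offload the KV cache stays in system RAM, and q4_0 quantization shrinks it further, which is what keeps the 6GB card from running out of VRAM under b9014's GPU-side default.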