""" GPU Model Preloading Utility Preloads models on AMD GPU to take advantage of 16GB VRAM """ import aiohttp import asyncio import json import globals async def preload_amd_models(): """ Preload both text and vision models on AMD GPU Since AMD RX 6800 has 16GB VRAM, we can keep both loaded simultaneously """ print("🔧 Preloading models on AMD GPU...") # Preload text model try: async with aiohttp.ClientSession() as session: payload = { "model": "llama3.1", "messages": [{"role": "user", "content": "Hi"}], "max_tokens": 1 } async with session.post( f"{globals.LLAMA_AMD_URL}/v1/chat/completions", json=payload, timeout=aiohttp.ClientTimeout(total=60) ) as response: if response.status == 200: print("✅ Text model (llama3.1) preloaded on AMD GPU") else: print(f"⚠️ Text model preload returned status {response.status}") except Exception as e: print(f"⚠️ Failed to preload text model on AMD: {e}") # Preload vision model try: async with aiohttp.ClientSession() as session: # Create a minimal test image (1x1 white pixel) import base64 test_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==" payload = { "model": "vision", "messages": [ { "role": "user", "content": [ {"type": "text", "text": "What do you see?"}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{test_image}"}} ] } ], "max_tokens": 1 } async with session.post( f"{globals.LLAMA_AMD_URL}/v1/chat/completions", json=payload, timeout=aiohttp.ClientTimeout(total=120) ) as response: if response.status == 200: print("✅ Vision model preloaded on AMD GPU") else: print(f"⚠️ Vision model preload returned status {response.status}") except Exception as e: print(f"⚠️ Failed to preload vision model on AMD: {e}") print("✅ AMD GPU preload complete - both models ready")