version: '3.8'

services:
  # Working AMD GPU Model Runner - Using Docker Model Runner (not llama.cpp)
  model-runner:
    image: docker/model-runner:latest
    container_name: model-runner
    privileged: true
    user: "0:0"  # Run as root to fix permission issues
    ports:
      - "11434:11434"  # Main API port (Ollama-compatible)
      - "8083:8080"    # Alternative API port
    environment:
      - HSA_OVERRIDE_GFX_VERSION=11.0.0  # AMD GPU version override
      - GPU_LAYERS=35
      - THREADS=8
      - BATCH_SIZE=512
      - CONTEXT_SIZE=4096
      - DISPLAY=${DISPLAY}
      - USER=${USER}
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    group_add:
      - video
    volumes:
      - ./models:/models:rw
      - ./data:/data:rw
      - /home/${USER}:/home/${USER}:rslave
    working_dir: /models
    restart: unless-stopped
    command: >
      /app/model-runner serve
      --port 11434
      --host 0.0.0.0
      --gpu-layers 35
      --threads 8
      --batch-size 512
      --ctx-size 4096
      --parallel
      --cont-batching
      --log-level info
      --log-format json
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    networks:
      - model-runner-network

volumes:
  model_runner_data:
    driver: local

networks:
  model-runner-network:
    driver: bridge
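
# Quick verification (a minimal sketch; assumes Docker Compose v2 and curl on the host):
#   docker compose up -d model-runner
#   docker compose logs -f model-runner       # watch startup and AMD GPU (ROCm) initialization
#   curl -s http://localhost:11434/api/tags   # same Ollama-compatible endpoint the healthcheck polls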