Files
gogo2/test_amd_gpu_fix.py

105 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""
Test AMD GPU compatibility and suggest fixes
"""
import torch
import sys
_RULE = "=" * 80


def _banner(title):
    """Print *title* framed by two rule lines, preceded by a blank line."""
    print("\n" + _RULE)
    print(title)
    print(_RULE)


def summary_message(failed_tests):
    """Return the closing banner text for a test run.

    Args:
        failed_tests: Names of the tests that failed without aborting the run.

    Returns:
        The all-passed message when the list is empty, otherwise a message
        naming every failed test.
    """
    if not failed_tests:
        return "ALL TESTS PASSED - GPU IS FULLY FUNCTIONAL!"
    return "SOME TESTS FAILED: " + ", ".join(failed_tests)


print(_RULE)
print("AMD GPU Compatibility Test")
print(_RULE)

# System info
rocm_version = getattr(torch.version, "hip", None) or "Not available"
print(f"\nPyTorch Version: {torch.__version__}")
print(f"ROCm Version: {rocm_version}")
print(f"CUDA/ROCm Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    print(f"Device Count: {torch.cuda.device_count()}")

    # Tests 4 and 5 report a failure and keep going; remember which ones
    # failed so the closing banner does not claim success after a failure.
    failed_tests = []

    # Test 1: Simple tensor creation (a failure here aborts the run)
    _banner("Test 1: Simple Tensor Creation")
    try:
        x = torch.tensor([1.0, 2.0, 3.0]).cuda()
        # GPU work is queued asynchronously; synchronize so that any device
        # error is raised inside this try block rather than in a later test.
        torch.cuda.synchronize()
        print("✓ PASSED: Simple tensor creation on GPU")
    except Exception as e:
        print(f"✗ FAILED: {e}")
        sys.exit(1)

    # Test 2: Matrix multiplication (a failure here aborts the run)
    _banner("Test 2: Matrix Multiplication")
    try:
        a = torch.randn(100, 100).cuda()
        b = torch.randn(100, 100).cuda()
        c = torch.matmul(a, b)
        torch.cuda.synchronize()
        print("✓ PASSED: Matrix multiplication on GPU")
    except Exception as e:
        print(f"✗ FAILED: {e}")
        sys.exit(1)

    # Test 3: Linear layer (This is where gfx1151 fails)
    _banner("Test 3: Neural Network Linear Layer (Critical Test)")
    try:
        x = torch.randn(10, 20).cuda()
        linear = torch.nn.Linear(20, 10).cuda()
        y = linear(x)
        torch.cuda.synchronize()
        print("✓ PASSED: Linear layer on GPU")
        print("✓ Your GPU is fully compatible!")
    except RuntimeError as e:
        # Only the "invalid device function" error gets the architecture
        # diagnosis; any other RuntimeError propagates unchanged.
        if "invalid device function" not in str(e):
            raise
        print(f"✗ FAILED: {e}")
        _banner("DIAGNOSIS: GPU Architecture Not Supported")
        print("\nYour AMD GPU architecture (likely gfx1151) is not supported by this PyTorch build.")
        print("\nRECOMMENDED ACTIONS:")
        print("1. The application will automatically use CPU mode")
        print("2. For GPU support, try: export HSA_OVERRIDE_GFX_VERSION=11.0.0")
        print("3. Or reinstall PyTorch nightly: pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4")
        print("\nSee AMD_GPU_FIX.md for detailed instructions")
        sys.exit(1)

    # Test 4: Conv2d layer (a failure is reported but does not abort)
    _banner("Test 4: Convolutional Layer")
    try:
        x = torch.randn(1, 3, 32, 32).cuda()
        conv = torch.nn.Conv2d(3, 16, 3).cuda()
        y = conv(x)
        torch.cuda.synchronize()
        print("✓ PASSED: Convolutional layer on GPU")
    except Exception as e:
        print(f"✗ FAILED: {e}")
        failed_tests.append("Test 4: Convolutional Layer")

    # Test 5: Transformer layer (a failure is reported but does not abort)
    _banner("Test 5: Transformer Layer")
    try:
        x = torch.randn(1, 10, 512).cuda()
        transformer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8).cuda()
        y = transformer(x)
        torch.cuda.synchronize()
        print("✓ PASSED: Transformer layer on GPU")
    except Exception as e:
        print(f"✗ FAILED: {e}")
        failed_tests.append("Test 5: Transformer Layer")

    # Closing banner reflects the actual outcome of tests 4 and 5.
    _banner(summary_message(failed_tests))
else:
    _banner("No CUDA/ROCm device detected")
    print("Application will run in CPU mode")