# gogo2/test_amd_gpu_fix.py

#!/usr/bin/env python3
"""
Test AMD GPU compatibility and suggest fixes
"""

import torch
import sys

# Opening banner: one print call, one output line per argument.
print(
    "=" * 80,
    "AMD GPU Compatibility Test",
    "=" * 80,
    sep="\n",
)

# Environment summary. The ROCm line falls back to 'Not available' when
# torch.version has no truthy `hip` attribute.
print(
    f"\nPyTorch Version: {torch.__version__}",
    f"ROCm Version: {getattr(torch.version, 'hip', None) or 'Not available'}",
    f"CUDA/ROCm Available: {torch.cuda.is_available()}",
    sep="\n",
)

def print_banner(title):
    """Print ``title`` framed by two 80-column rules, preceded by a blank line."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def run_check(title, check, success_message):
    """Run one GPU check under a banner and report whether it succeeded.

    Args:
        title: Heading printed in the banner above the check.
        check: Zero-argument callable that performs the GPU work.
        success_message: Line printed when ``check`` completes without raising.

    Returns:
        True if ``check`` completed; False if it raised. The exception is
        printed as a ``FAILED`` line and is not propagated, so the caller
        decides whether the failure is fatal.
    """
    print_banner(title)
    try:
        check()
        # GPU kernels are launched asynchronously. Synchronizing here makes a
        # device-side failure surface inside this check instead of being
        # attributed to whichever check touches the GPU next.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    except Exception as e:
        print(f"✗ FAILED: {e}")
        return False
    print(success_message)
    return True


def print_summary(failed_titles):
    """Print the final verdict and return the matching process exit status.

    Args:
        failed_titles: Titles of the checks that failed; empty when all passed.

    Returns:
        0 when ``failed_titles`` is empty, otherwise 1.
    """
    if failed_titles:
        print_banner("SOME TESTS FAILED - GPU IS NOT FULLY FUNCTIONAL")
        for title in failed_titles:
            print(f"✗ {title}")
        return 1
    print_banner("ALL TESTS PASSED - GPU IS FULLY FUNCTIONAL!")
    # Only claimed once every check (not just the linear layer) has passed.
    print("Your GPU is fully compatible!")
    return 0


# The checks build their inputs on the CPU and move them with .cuda(), so each
# one exercises the host-to-device copy plus exactly one kind of GPU operation.

def check_tensor_creation():
    """Copy a small CPU tensor to the GPU."""
    torch.tensor([1.0, 2.0, 3.0]).cuda()


def check_matmul():
    """Multiply two 100x100 matrices on the GPU."""
    a = torch.randn(100, 100).cuda()
    b = torch.randn(100, 100).cuda()
    torch.matmul(a, b)


def check_conv2d():
    """Run a 3->16 channel, 3x3 convolution over one 32x32 image on the GPU."""
    x = torch.randn(1, 3, 32, 32).cuda()
    conv = torch.nn.Conv2d(3, 16, 3).cuda()
    conv(x)


def check_transformer():
    """Run one TransformerEncoderLayer forward pass on the GPU."""
    x = torch.randn(1, 10, 512).cuda()
    transformer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8).cuda()
    transformer(x)


if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    print(f"Device Count: {torch.cuda.device_count()}")

    # Tests 1 and 2 are prerequisites: if either fails, stop immediately.
    if not run_check(
        "Test 1: Simple Tensor Creation",
        check_tensor_creation,
        "PASSED: Simple tensor creation on GPU",
    ):
        sys.exit(1)

    if not run_check(
        "Test 2: Matrix Multiplication",
        check_matmul,
        "PASSED: Matrix multiplication on GPU",
    ):
        sys.exit(1)

    # Test 3: Linear layer (this is where gfx1151 fails). It is handled inline
    # because only the "invalid device function" RuntimeError gets the
    # architecture diagnosis; any other error propagates with its traceback.
    print_banner("Test 3: Neural Network Linear Layer (Critical Test)")
    try:
        x = torch.randn(10, 20).cuda()
        linear = torch.nn.Linear(20, 10).cuda()
        y = linear(x)
        # Force any asynchronous kernel error to be raised inside this try.
        torch.cuda.synchronize()
        print("PASSED: Linear layer on GPU")
    except RuntimeError as e:
        if "invalid device function" not in str(e):
            raise
        print(f"✗ FAILED: {e}")
        print_banner("DIAGNOSIS: GPU Architecture Not Supported")
        print("\nYour AMD GPU architecture (likely gfx1151) is not supported by this PyTorch build.")
        print("\nRECOMMENDED ACTIONS:")
        print("1. The application will automatically use CPU mode")
        print("2. For GPU support, try: export HSA_OVERRIDE_GFX_VERSION=11.0.0")
        print("3. Or reinstall PyTorch nightly: pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4")
        print("\nSee AMD_GPU_FIX.md for detailed instructions")
        sys.exit(1)

    # Tests 4 and 5 are non-fatal: run both, remember which ones failed, and
    # let the summary reflect the real outcome.
    failed_titles = []
    for title, check, success_message in (
        ("Test 4: Convolutional Layer", check_conv2d, "PASSED: Convolutional layer on GPU"),
        ("Test 5: Transformer Layer", check_transformer, "PASSED: Transformer layer on GPU"),
    ):
        if not run_check(title, check, success_message):
            failed_titles.append(title)

    exit_status = print_summary(failed_titles)
    if exit_status:
        # Non-zero exit so wrappers do not mistake a partial failure for success.
        sys.exit(exit_status)

else:
    print_banner("No CUDA/ROCm device detected")
    print("Application will run in CPU mode")