#!/usr/bin/env python3
"""
Test GPU Training - Check if our models actually train and use GPU
"""

import logging
import sys
import time
import traceback
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Add project root to path (the ``NN.*`` imports below are resolved via sys.path)
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def test_gpu_availability():
    """Test if GPU is available and working.

    Prints the PyTorch/CUDA versions and every visible GPU, then runs a small
    matrix multiplication on ``cuda:0``.

    Returns:
        bool: True if CUDA is available and the test operation completed on
        the device, False otherwise.
    """
    logger.info("=== GPU AVAILABILITY TEST ===")

    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

    if not torch.cuda.is_available():
        print("❌ No CUDA available")
        return False

    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU count: {torch.cuda.device_count()}")

    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f" Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f} GB")

    # Test GPU operations
    try:
        device = torch.device('cuda:0')
        x = torch.randn(100, 100, device=device)
        y = torch.randn(100, 100, device=device)
        z = torch.mm(x, y)
        # CUDA kernels are launched asynchronously; wait for completion so a
        # failing kernel raises here instead of being reported as a success.
        torch.cuda.synchronize(device)
        print(f"✅ GPU operations working: {z.device}")
        return True
    except Exception as e:
        print(f"❌ GPU operations failed: {e}")
        return False


def test_simple_training():
    """Test if a simple neural network actually trains.

    Trains a small MLP for 100 full-batch Adam steps on synthetic 3-class data
    and checks that the cross-entropy loss drops by at least 10%.

    Returns:
        bool: True if the final loss is below 90% of the initial loss.
    """
    logger.info("=== SIMPLE TRAINING TEST ===")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create a simple model
    class SimpleNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.layers = nn.Sequential(
                nn.Linear(10, 64),
                nn.ReLU(),
                nn.Linear(64, 32),
                nn.ReLU(),
                nn.Linear(32, 3),
            )

        def forward(self, x):
            return self.layers(x)

    model = SimpleNet().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # Generate some dummy data. Labels that are independent of the inputs
    # carry no learnable signal, which makes the 10% loss-reduction check
    # below unreliable; give every class its own input mean so a working
    # training loop has something to learn.
    y = torch.randint(0, 3, (1000,), device=device)
    class_means = 3.0 * torch.eye(3, 10, device=device)
    X = class_means[y] + torch.randn(1000, 10, device=device)

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Data shape: {X.shape}, Labels shape: {y.shape}")

    # Training loop
    losses = []

    print("Training for 100 steps...")
    start_time = time.time()

    for step in range(100):
        # Forward pass
        outputs = model(X)
        loss = criterion(outputs, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = loss.item()
        losses.append(loss_val)

        if step % 20 == 0:
            print(f"Step {step}: Loss = {loss_val:.4f}")

    end_time = time.time()
    initial_loss = losses[0]
    final_loss = losses[-1]

    print(f"Training completed in {end_time - start_time:.2f} seconds")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Loss reduction: {initial_loss - final_loss:.4f}")

    # Check if training actually happened
    if final_loss < initial_loss * 0.9:  # At least 10% reduction
        print("✅ Training is working - loss decreased significantly")
        return True

    print("❌ Training may not be working - loss didn't decrease much")
    return False


def test_our_models():
    """Test if our actual models can train.

    Builds a ``DQNAgent``, fills it with 201 random transitions through
    ``remember`` and calls ``replay`` ten times.

    Returns:
        bool: True if at least one ``replay`` call returned a positive loss;
        False if none did or if any step raised an exception.
    """
    logger.info("=== OUR MODELS TEST ===")

    try:
        # Test DQN Agent
        from NN.models.dqn_agent import DQNAgent

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Testing DQN Agent on {device}")

        # Create agent
        state_shape = (100,)  # Simple state
        agent = DQNAgent(
            state_shape=state_shape,
            n_actions=3,
            learning_rate=0.001,
            device=device,
        )

        print("✅ DQN Agent created successfully")
        print(f" Device: {agent.device}")
        print(f" Policy net device: {next(agent.policy_net.parameters()).device}")

        # Test training step
        state = np.random.randn(100).astype(np.float32)
        action = 1
        reward = 0.5
        next_state = np.random.randn(100).astype(np.float32)
        done = False

        # Add experience and train
        agent.remember(state, action, reward, next_state, done)

        # Add more experiences
        for _ in range(200):  # Need enough for batch
            s = np.random.randn(100).astype(np.float32)
            a = np.random.randint(0, 3)
            r = np.random.randn() * 0.1
            ns = np.random.randn(100).astype(np.float32)
            d = np.random.random() < 0.1
            agent.remember(s, a, r, ns, d)

        # Test training
        print("Testing training step...")
        initial_loss = None
        for i in range(10):
            loss = agent.replay()
            # Treat a missing (None) loss like "no training happened" rather
            # than letting the comparison raise a TypeError.
            if loss is not None and loss > 0:
                if initial_loss is None:
                    initial_loss = loss
                print(f" Step {i}: Loss = {loss:.4f}")

        if initial_loss is None:
            print("❌ DQN training returned no loss")
            # Report the failure to the caller so the summary matches the
            # message printed above.
            return False

        print("✅ DQN training is working")
        return True

    except Exception as e:
        print(f"❌ Error testing our models: {e}")
        traceback.print_exc()
        return False


def test_cnn_model():
    """Test CNN model training.

    Builds an ``EnhancedCNN``, runs one forward pass on a random batch and one
    optimizer step using cross-entropy on the first element of the output.

    Returns:
        bool: True if the forward pass returned a tuple and the training step
        completed; False on an unexpected output type or any exception.
    """
    logger.info("=== CNN MODEL TEST ===")

    try:
        from NN.models.enhanced_cnn import EnhancedCNN

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Testing Enhanced CNN on {device}")

        # Create model
        state_dim = (3, 20, 26)  # 3 timeframes, 20 window, 26 features
        n_actions = 3

        model = EnhancedCNN(state_dim, n_actions).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()

        print("✅ Enhanced CNN created successfully")
        print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")

        # Test forward pass
        batch_size = 32
        x = torch.randn(batch_size, 3, 20, 26, device=device)

        print("Testing forward pass...")
        outputs = model(x)

        if not isinstance(outputs, tuple):
            print(f"❌ Unexpected output format: {type(outputs)}")
            return False

        # The model is expected to return exactly five values; only the
        # action output and the features are inspected here.
        action_probs, _extrema_pred, _price_pred, features, _advanced_pred = outputs
        print("✅ Forward pass successful")
        print(f" Action probs shape: {action_probs.shape}")
        print(f" Features shape: {features.shape}")

        # Test training step
        y = torch.randint(0, 3, (batch_size,), device=device)

        print("Testing training step...")
        loss = criterion(action_probs, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"✅ CNN training step successful, loss: {loss.item():.4f}")
        return True

    except Exception as e:
        print(f"❌ Error testing CNN model: {e}")
        traceback.print_exc()
        return False


def main():
    """Run all tests.

    Runs the four checks in order, prints a pass/fail summary and hints for
    each failed check.

    Returns:
        int: 0 if every check passed, 1 otherwise (used as the exit code).
    """
    print("=" * 60)
    print("TESTING GPU TRAINING FUNCTIONALITY")
    print("=" * 60)

    results = {}

    # Test 1: GPU availability
    results['gpu'] = test_gpu_availability()
    print()

    # Test 2: Simple training
    results['simple_training'] = test_simple_training()
    print()

    # Test 3: Our DQN models
    results['dqn_models'] = test_our_models()
    print()

    # Test 4: CNN models
    results['cnn_models'] = test_cnn_model()
    print()

    # Summary
    print("=" * 60)
    print("TEST RESULTS SUMMARY")
    print("=" * 60)

    for test_name, passed in results.items():
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"{test_name.upper()}: {status}")

    all_passed = all(results.values())

    if all_passed:
        print("\n🎉 ALL TESTS PASSED - Your training should work with GPU!")
    else:
        print("\n⚠️ SOME TESTS FAILED - Check the issues above")

        if not results['gpu']:
            print(" → GPU not available or not working")
        if not results['simple_training']:
            print(" → Basic training loop not working")
        if not results['dqn_models']:
            print(" → DQN models have issues")
        if not results['cnn_models']:
            print(" → CNN models have issues")

    return 0 if all_passed else 1


if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)