scalping dash also works initially
test_gpu_training.py (new file, 301 lines)
@@ -0,0 +1,301 @@
#!/usr/bin/env python3
"""
Test GPU Training - Check if our models actually train and use GPU
"""

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
import logging
from pathlib import Path
import sys

# Add project root to path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def test_gpu_availability():
    """Test if GPU is available and working"""
    logger.info("=== GPU AVAILABILITY TEST ===")

    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"  Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f} GB")

        # Test GPU operations
        try:
            device = torch.device('cuda:0')
            x = torch.randn(100, 100, device=device)
            y = torch.randn(100, 100, device=device)
            z = torch.mm(x, y)
            print(f"✅ GPU operations working: {z.device}")
            return True
        except Exception as e:
            print(f"❌ GPU operations failed: {e}")
            return False
    else:
        print("❌ No CUDA available")
        return False

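# --- Optional helper (sketch, not part of the original test flow) ----------
# A rough CPU-vs-GPU matmul timing comparison, assuming only the stock PyTorch
# API (torch.randn, torch.matmul, torch.cuda.synchronize). It is a quick sanity
# check that the GPU actually speeds up compute, not a rigorous benchmark.
def benchmark_matmul(size=2048, repeats=10):
    """Compare matmul wall time on CPU vs. GPU (if available)."""
    devices = ['cpu'] + (['cuda'] if torch.cuda.is_available() else [])
    for device_name in devices:
        device = torch.device(device_name)
        a = torch.randn(size, size, device=device)
        b = torch.randn(size, size, device=device)
        torch.matmul(a, b)  # warm-up so kernel launch overhead is not timed
        if device.type == 'cuda':
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(repeats):
            torch.matmul(a, b)
        if device.type == 'cuda':
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        print(f"  {device_name}: {repeats} x {size}x{size} matmul in {elapsed:.3f}s")
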
def test_simple_training():
    """Test if a simple neural network actually trains"""
    logger.info("=== SIMPLE TRAINING TEST ===")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create a simple model
    class SimpleNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.layers = nn.Sequential(
                nn.Linear(10, 64),
                nn.ReLU(),
                nn.Linear(64, 32),
                nn.ReLU(),
                nn.Linear(32, 3)
            )

        def forward(self, x):
            return self.layers(x)

    model = SimpleNet().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # Generate some dummy data
    X = torch.randn(1000, 10, device=device)
    y = torch.randint(0, 3, (1000,), device=device)

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Data shape: {X.shape}, Labels shape: {y.shape}")

    # Training loop
    initial_loss = None
    losses = []

    print("Training for 100 steps...")
    start_time = time.time()

    for step in range(100):
        # Forward pass
        outputs = model(X)
        loss = criterion(outputs, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = loss.item()
        losses.append(loss_val)

        if step == 0:
            initial_loss = loss_val

        if step % 20 == 0:
            print(f"Step {step}: Loss = {loss_val:.4f}")

    end_time = time.time()
    final_loss = losses[-1]

    print(f"Training completed in {end_time - start_time:.2f} seconds")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Loss reduction: {initial_loss - final_loss:.4f}")

    # Check if training actually happened
    if final_loss < initial_loss * 0.9:  # At least 10% reduction
        print("✅ Training is working - loss decreased significantly")
        return True
    else:
        print("❌ Training may not be working - loss didn't decrease much")
        return False

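# --- Optional helper (sketch, not in the original tests) -------------------
# Checks that a single optimizer step actually moves the parameters, which
# catches silent failures such as a detached graph or a zero learning rate.
# Uses only standard PyTorch calls; the helper name is illustrative, not an
# existing project API.
def parameters_changed(model, optimizer, loss_fn, x, y):
    """Return True if one training step changes at least one parameter."""
    before = [p.detach().clone() for p in model.parameters()]
    optimizer.zero_grad()
    loss_fn(model(x), y).backward()
    optimizer.step()
    return any(not torch.equal(b, p.detach())
               for b, p in zip(before, model.parameters()))
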
def test_our_models():
    """Test if our actual models can train"""
    logger.info("=== OUR MODELS TEST ===")

    try:
        # Test DQN Agent
        from NN.models.dqn_agent import DQNAgent

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Testing DQN Agent on {device}")

        # Create agent
        state_shape = (100,)  # Simple state
        agent = DQNAgent(
            state_shape=state_shape,
            n_actions=3,
            learning_rate=0.001,
            device=device
        )

        print("✅ DQN Agent created successfully")
        print(f"   Device: {agent.device}")
        print(f"   Policy net device: {next(agent.policy_net.parameters()).device}")

        # Build one sample transition
        state = np.random.randn(100).astype(np.float32)
        action = 1
        reward = 0.5
        next_state = np.random.randn(100).astype(np.float32)
        done = False

        # Add experience and train
        agent.remember(state, action, reward, next_state, done)

        # Add more experiences so there is enough for a batch
        for _ in range(200):
            s = np.random.randn(100).astype(np.float32)
            a = np.random.randint(0, 3)
            r = np.random.randn() * 0.1
            ns = np.random.randn(100).astype(np.float32)
            d = np.random.random() < 0.1
            agent.remember(s, a, r, ns, d)

        # Test training
        print("Testing training step...")
        initial_loss = None
        for i in range(10):
            loss = agent.replay()
            if loss and loss > 0:  # replay() may return 0/None before it can sample a batch
                if initial_loss is None:
                    initial_loss = loss
                print(f"   Step {i}: Loss = {loss:.4f}")

        if initial_loss is not None:
            print("✅ DQN training is working")
        else:
            print("❌ DQN training returned no loss")

        # Only report success if replay() actually produced a training loss
        return initial_loss is not None

    except Exception as e:
        print(f"❌ Error testing our models: {e}")
        import traceback
        traceback.print_exc()
        return False

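# --- Optional check (sketch) ------------------------------------------------
# Confirms that agent.replay() actually updates the policy network weights.
# It relies only on attributes already used above (agent.policy_net,
# agent.replay); if the DQNAgent API differs, adjust accordingly.
def dqn_weights_move(agent):
    """Return True if a replay step changes the policy network parameters."""
    before = [p.detach().clone() for p in agent.policy_net.parameters()]
    agent.replay()
    return any(not torch.equal(b, p.detach())
               for b, p in zip(before, agent.policy_net.parameters()))
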
def test_cnn_model():
    """Test CNN model training"""
    logger.info("=== CNN MODEL TEST ===")

    try:
        from NN.models.enhanced_cnn import EnhancedCNN

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Testing Enhanced CNN on {device}")

        # Create model
        state_dim = (3, 20, 26)  # 3 timeframes, 20 window, 26 features
        n_actions = 3

        model = EnhancedCNN(state_dim, n_actions).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()

        print("✅ Enhanced CNN created successfully")
        print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")

        # Test forward pass
        batch_size = 32
        x = torch.randn(batch_size, 3, 20, 26, device=device)

        print("Testing forward pass...")
        outputs = model(x)

        if isinstance(outputs, tuple):
            action_probs, extrema_pred, price_pred, features, advanced_pred = outputs
            print("✅ Forward pass successful")
            print(f"   Action probs shape: {action_probs.shape}")
            print(f"   Features shape: {features.shape}")
        else:
            print(f"❌ Unexpected output format: {type(outputs)}")
            return False

        # Test training step
        y = torch.randint(0, 3, (batch_size,), device=device)

        print("Testing training step...")
        loss = criterion(action_probs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"✅ CNN training step successful, loss: {loss.item():.4f}")
        return True

    except Exception as e:
        print(f"❌ Error testing CNN model: {e}")
        import traceback
        traceback.print_exc()
        return False

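# --- Optional helper (sketch) -----------------------------------------------
# Prints current CUDA memory usage so out-of-memory problems during the DQN or
# CNN tests are easier to spot. torch.cuda.memory_allocated/memory_reserved are
# standard PyTorch calls; the helper itself is not part of the project code.
def report_gpu_memory(tag=""):
    """Log allocated/reserved CUDA memory in GB, if CUDA is available."""
    if not torch.cuda.is_available():
        return
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"  GPU memory {tag}: allocated {allocated:.2f} GB, reserved {reserved:.2f} GB")
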
def main():
    """Run all tests"""
    print("=" * 60)
    print("TESTING GPU TRAINING FUNCTIONALITY")
    print("=" * 60)

    results = {}

    # Test 1: GPU availability
    results['gpu'] = test_gpu_availability()
    print()

    # Test 2: Simple training
    results['simple_training'] = test_simple_training()
    print()

    # Test 3: Our DQN models
    results['dqn_models'] = test_our_models()
    print()

    # Test 4: CNN models
    results['cnn_models'] = test_cnn_model()
    print()

    # Summary
    print("=" * 60)
    print("TEST RESULTS SUMMARY")
    print("=" * 60)

    for test_name, passed in results.items():
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"{test_name.upper()}: {status}")

    all_passed = all(results.values())

    if all_passed:
        print("\n🎉 ALL TESTS PASSED - Your training should work with GPU!")
    else:
        print("\n⚠️ SOME TESTS FAILED - Check the issues above")

        if not results['gpu']:
            print("  → GPU not available or not working")
        if not results['simple_training']:
            print("  → Basic training loop not working")
        if not results['dqn_models']:
            print("  → DQN models have issues")
        if not results['cnn_models']:
            print("  → CNN models have issues")

    return 0 if all_passed else 1

if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)