#!/usr/bin/env python3
"""
Increase GPU Utilization for Training

This script provides optimizations to maximize GPU usage during training.
"""

import logging
import sys
import traceback
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn

# Add the directory containing this script to the import path so that
# project packages (e.g. the ``NN`` package imported further down) resolve
# no matter which working directory the script is launched from.
# ``resolve()`` makes the entry absolute, so it stays valid even if the
# process changes its working directory later.
project_root = Path(__file__).resolve().parent
sys.path.insert(0, str(project_root))

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def optimize_training_for_gpu() -> bool:
    """Print a GPU tuning guide for the current machine.

    Reports the name and total memory of CUDA device 0, the hard-coded
    "current" batch sizes, and a list of performance suggestions. The
    larger-batch recommendations and memory estimates are printed only when
    the device reports at least 7.5 GiB of total memory.

    Returns:
        bool: True when a CUDA device is available, False otherwise.
    """
    print("🚀 GPU TRAINING OPTIMIZATION GUIDE")
    print("=" * 50)

    # Guard clause: there is nothing to report without a CUDA device.
    if not torch.cuda.is_available():
        print("❌ No GPU available")
        return False

    # Check current GPU setup (device index 0 only).
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3  # bytes -> GiB
    print(f"GPU: {gpu_name}")
    print(f"VRAM: {gpu_memory:.1f} GB")
    print()

    # Calculate optimal batch sizes
    print("📊 OPTIMAL BATCH SIZES:")
    print("Current batch sizes:")
    print(" - DQN Agent: 128")
    print(" - CNN Model: 32")
    print()

    # For RTX 4060 with 8GB VRAM, we can increase batch sizes.
    # NOTE(review): the original indentation was lost; the memory estimates
    # are assumed to belong to this branch because they describe the ~8GB
    # card - confirm against the upstream file.
    if gpu_memory >= 7.5:  # RTX 4060 has ~8GB
        print("🔥 RECOMMENDED OPTIMIZATIONS:")
        print(" 1. Increase DQN batch size: 128 → 256 or 512")
        print(" 2. Increase CNN batch size: 32 → 64 or 128")
        print(" 3. Use larger model variants")
        print(" 4. Enable gradient accumulation")
        print()

        # Show memory usage estimates
        print("💾 MEMORY USAGE ESTIMATES:")
        print(" - Current DQN (24M params): ~1.5GB")
        print(" - Current CNN (168M params): ~3.2GB")
        print(" - Available for larger batches: ~3GB")
        print()

    print("⚡ PERFORMANCE OPTIMIZATIONS:")
    print(" 1. ✅ Mixed precision training (already enabled)")
    print(" 2. ✅ GPU tensors (already enabled)")
    print(" 3. 🔧 Increase batch sizes")
    print(" 4. 🔧 Use DataLoader with multiple workers")
    print(" 5. 🔧 Pin memory for faster transfers")
    print(" 6. 🔧 Compile models with torch.compile()")
    print()

    return True


def create_optimized_training_config() -> dict:
    """Create the optimized training configuration.

    Returns:
        dict: A freshly built mapping with four sections - ``'dqn'``,
        ``'cnn'``, ``'data_loading'`` and ``'gpu'`` - each a dict of
        setting name to value. A new object is created on every call, so
        callers may mutate the result freely.
    """
    return {
        # DQN Optimizations
        'dqn': {
            'batch_size': 512,                 # Increased from 128
            'buffer_size': 100000,             # Increased from 20000
            'learning_rate': 0.0003,           # Slightly reduced for stability
            'target_update': 10,               # More frequent updates
            'gradient_accumulation_steps': 2,  # Accumulate gradients
        },

        # CNN Optimizations
        'cnn': {
            'batch_size': 128,                 # Increased from 32
            'learning_rate': 0.001,
            'epochs': 200,                     # More epochs for better learning
            'gradient_accumulation_steps': 4,
        },

        # Data Loading Optimizations
        'data_loading': {
            'num_workers': 4,                  # Parallel data loading
            'pin_memory': True,                # Faster CPU->GPU transfers
            'persistent_workers': True,        # Keep workers alive
        },

        # GPU Optimizations
        'gpu': {
            'mixed_precision': True,
            'compile_model': True,             # Use torch.compile for speed
            'channels_last': True,             # Memory layout optimization
        },
    }


def _smoke_test_dqn():
    """Build a large-batch DQN agent, fill its memory and run one replay step."""
    # Imported here rather than at module level so that a missing or broken
    # project package is reported by the caller's error handler.
    from NN.models.dqn_agent import DQNAgent

    print("1. Testing optimized DQN Agent...")

    # Create agent with larger batch size
    agent = DQNAgent(
        state_shape=(100,),
        n_actions=3,
        batch_size=512,       # Increased batch size
        buffer_size=100000,   # Larger memory
        learning_rate=0.0003,
    )

    print(f" ✅ DQN Agent with batch size {agent.batch_size}")
    print(f" ✅ Memory buffer size: {agent.buffer_size:,}")

    # Test larger batch training
    print(" Testing larger batch training...")

    # Store 1000 random transitions (more than one 512-sample batch).
    for _ in range(1000):
        state = np.random.randn(100).astype(np.float32)
        action = np.random.randint(0, 3)
        reward = np.random.randn() * 0.1
        next_state = np.random.randn(100).astype(np.float32)
        done = np.random.random() < 0.1
        agent.remember(state, action, reward, next_state, done)

    # Train with larger batch
    loss = agent.replay()
    # NOTE(review): replay() is defined outside this file. The None check
    # keeps a "no training step happened" result from raising TypeError in
    # the comparison - confirm what replay() actually returns.
    if loss is not None and loss > 0:
        print(f" ✅ Large batch training successful, loss: {loss:.4f}")

    print()


def _smoke_test_cnn():
    """Run one forward pass of the project's CNN on a random 128-sample batch."""
    from NN.models.enhanced_cnn import EnhancedCNN

    print("2. Testing optimized CNN...")

    input_shape = (3, 20, 26)
    model = EnhancedCNN(input_shape, 3)

    # Test larger batch
    batch_size = 128  # Increased from 32
    x = torch.randn(batch_size, *input_shape, device=model.device)

    print(f" Testing batch size: {batch_size}")

    # Forward pass
    outputs = model(x)
    if isinstance(outputs, tuple):
        print(" ✅ Large batch forward pass successful")
        print(f" ✅ Output shape: {outputs[0].shape}")

    print()


def _report_gpu_memory():
    """Print allocated vs. total GPU memory with a batch-size hint; no-op without CUDA."""
    if not torch.cuda.is_available():
        return

    memory_used = torch.cuda.memory_allocated() / 1024**3  # bytes -> GiB
    memory_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    memory_percent = (memory_used / memory_total) * 100

    print("📊 GPU Memory Usage:")
    print(f" Used: {memory_used:.2f} GB / {memory_total:.1f} GB ({memory_percent:.1f}%)")

    if memory_percent < 70:
        print(" 💡 You can increase batch sizes further!")
    elif memory_percent > 90:
        print(" ⚠️ Consider reducing batch sizes")
    else:
        print(" ✅ Good memory utilization")


def apply_gpu_optimizations() -> bool:
    """Smoke-test the project models with enlarged batch sizes.

    Runs, in order: a DQN agent test (batch size 512, one replay step over
    1000 random transitions), a CNN forward pass on a 128-sample batch, and
    a GPU memory usage report. Progress is written to stdout.

    Returns:
        bool: True when every step completed; False when any step raised,
        in which case the error and its traceback are printed.
    """
    print("🔧 APPLYING GPU OPTIMIZATIONS...")
    print()

    try:
        _smoke_test_dqn()
        _smoke_test_cnn()
        _report_gpu_memory()
    except Exception as e:
        # Top-level boundary of this tool: report the failure in full and
        # signal it through the return value instead of crashing.
        print(f"❌ Error applying optimizations: {e}")
        traceback.print_exc()
        return False

    print()
    print("🎉 GPU OPTIMIZATIONS APPLIED SUCCESSFULLY!")
    print()
    print("📝 NEXT STEPS:")
    print(" 1. Update your training scripts with larger batch sizes")
    print(" 2. Use the optimized configurations")
    print(" 3. Monitor GPU utilization during training")
    print(" 4. Adjust batch sizes based on memory usage")

    return True


def monitor_gpu_during_training() -> None:
    """Print a short cheat sheet of ways to monitor the GPU while training.

    The text lists two ``nvidia-smi`` invocations, a ``GPUtil`` one-liner and
    a ``torch.cuda.memory_allocated`` snippet. Nothing is executed; the
    commands are only displayed. Empty strings produce the blank separator
    lines.
    """
    lines = (
        "📊 GPU MONITORING DURING TRAINING",
        "=" * 40,
        "",
        "Use these commands to monitor GPU utilization:",
        "",
        "1. NVIDIA System Management Interface:",
        " nvidia-smi -l 1",
        " (Updates every 1 second)",
        "",
        "2. Continuous monitoring:",
        " watch -n 1 nvidia-smi",
        "",
        "3. Python GPU monitoring:",
        " python -c \"import GPUtil; GPUtil.showUtilization()\"",
        "",
        "4. Memory monitoring in your training script:",
        " if torch.cuda.is_available():",
        # Deliberately a plain string: the f-string is shown to the user,
        # not evaluated here.
        " print(f'GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB')",
        "",
    )
    for line in lines:
        print(line)


def main() -> int:
    """Run the optimization tool end to end.

    Steps: print the GPU guide, print the optimized configuration, run the
    model smoke tests, then print the monitoring cheat sheet.

    Returns:
        int: Process exit code - 0 on success, 1 when no GPU is available
        or when applying the optimizations failed.
    """
    print("🚀 GPU TRAINING OPTIMIZATION TOOL")
    print("=" * 50)
    print()

    # Check GPU setup
    if not optimize_training_for_gpu():
        return 1

    # Show optimized config
    config = create_optimized_training_config()
    print("⚙️ OPTIMIZED CONFIGURATION:")
    for section, settings in config.items():
        print(f" {section.upper()}:")
        for key, value in settings.items():
            print(f" {key}: {value}")
        # NOTE(review): original indentation was lost; this blank line is
        # assumed to separate sections (one per section) - confirm.
        print()

    # Apply optimizations
    if not apply_gpu_optimizations():
        return 1

    # Show monitoring info
    monitor_gpu_during_training()

    print("✅ OPTIMIZATION COMPLETE!")
    print()
    print("Your training is working correctly with GPU!")
    print("Use the optimizations above to increase GPU utilization.")

    return 0


# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())