gogo2/training/rl_trainer.py

"""
RL Training Pipeline - Scalping Agent Training
Comprehensive training pipeline for scalping RL agents:
- Environment setup and management
- Agent training with experience replay
- Performance tracking and evaluation
- Memory-efficient training loops
"""
import torch
import numpy as np
import pandas as pd
import logging
from typing import Dict, List, Tuple, Optional, Any
import time
from pathlib import Path
import matplotlib.pyplot as plt
from collections import deque
import random
from torch.utils.tensorboard import SummaryWriter

# Add project imports
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from core.config import get_config
from core.data_provider import DataProvider
from models.rl.scalping_agent import ScalpingEnvironment, ScalpingRLAgent
from utils.model_utils import robust_save, robust_load
logger = logging.getLogger(__name__)
class RLTrainer:
"""
RL Training Pipeline for Scalping
"""
    def __init__(self, data_provider: DataProvider, config: Optional[Dict] = None):
        self.data_provider = data_provider
        self.config = config or get_config()

        # Training parameters
        self.num_episodes = 1000
        self.max_steps_per_episode = 1000
        self.training_frequency = 4  # Train every N steps
        self.evaluation_frequency = 50  # Evaluate every N episodes
        self.save_frequency = 100  # Save model every N episodes

        # Environment parameters
        self.symbols = ['ETH/USDT']
        self.initial_balance = 1000.0
        self.max_position_size = 0.1

        # Agent parameters (will be set when we know state dimension)
        self.state_dim = None
        self.action_dim = 3  # BUY, SELL, HOLD
        self.learning_rate = 1e-4
        self.memory_size = 50000

        # Device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Training state
        self.environment = None
        self.agent = None
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_balances = []
        self.episode_trades = []
        self.training_losses = []

        # Performance tracking
        self.best_reward = -float('inf')
        self.best_balance = 0.0
        self.win_rates = []
        self.avg_rewards = []

        # TensorBoard setup
        self.setup_tensorboard()

        logger.info(f"RLTrainer initialized for symbols: {self.symbols}")
    def setup_tensorboard(self):
        """Setup TensorBoard logging"""
        # Create tensorboard logs directory
        log_dir = Path("runs") / f"rl_training_{int(time.time())}"
        log_dir.mkdir(parents=True, exist_ok=True)
        self.writer = SummaryWriter(log_dir=str(log_dir))
        self.tensorboard_dir = log_dir
        logger.info(f"TensorBoard logging to: {log_dir}")
        logger.info("Run: tensorboard --logdir=runs")
    def setup_environment_and_agent(self) -> Tuple[ScalpingEnvironment, ScalpingRLAgent]:
        """Setup trading environment and RL agent"""
        logger.info("Setting up environment and agent...")

        # Create environment
        environment = ScalpingEnvironment(
            data_provider=self.data_provider,
            symbol=self.symbols[0],
            initial_balance=self.initial_balance,
            max_position_size=self.max_position_size
        )

        # Get state dimension by resetting environment
        initial_state = environment.reset()
        if initial_state is None:
            raise ValueError("Could not get initial state from environment")

        self.state_dim = len(initial_state)
        logger.info(f"State dimension: {self.state_dim}")

        # Create agent
        agent = ScalpingRLAgent(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            learning_rate=self.learning_rate,
            memory_size=self.memory_size
        )

        return environment, agent
    def run_episode(self, episode_num: int, training: bool = True) -> Dict:
        """Run a single episode"""
        state = self.environment.reset()
        if state is None:
            return {'error': 'Could not reset environment'}

        episode_reward = 0.0
        episode_loss = 0.0
        step_count = 0
        trades_made = 0

        # Episode loop
        for step in range(self.max_steps_per_episode):
            # Select action
            action = self.agent.act(state, training=training)

            # Execute action in environment
            next_state, reward, done, info = self.environment.step(action, step)
            if next_state is None:
                break

            # Store experience if training
            if training:
                # Determine if this is a high-priority experience
                priority = (abs(reward) > 0.1 or
                            info.get('trade_info', {}).get('executed', False))
                self.agent.remember(state, action, reward, next_state, done, priority)

                # Train agent
                if step % self.training_frequency == 0 and len(self.agent.memory) > self.agent.batch_size:
                    loss = self.agent.replay()
                    if loss is not None:
                        episode_loss += loss

            # Update state
            state = next_state
            episode_reward += reward
            step_count += 1

            # Track trades
            if info.get('trade_info', {}).get('executed', False):
                trades_made += 1

            if done:
                break

        # Episode results
        final_balance = info.get('balance', self.initial_balance)
        total_fees = info.get('total_fees', 0.0)

        episode_results = {
            'episode': episode_num,
            'reward': episode_reward,
            'steps': step_count,
            'balance': final_balance,
            'trades': trades_made,
            'fees': total_fees,
            'pnl': final_balance - self.initial_balance,
            'pnl_percentage': (final_balance - self.initial_balance) / self.initial_balance * 100,
            'avg_loss': episode_loss / max(step_count // self.training_frequency, 1) if training else 0
        }

        return episode_results
    def evaluate_agent(self, num_episodes: int = 10) -> Dict:
        """Evaluate agent performance"""
        logger.info(f"Evaluating agent over {num_episodes} episodes...")

        evaluation_results = []
        total_reward = 0.0
        total_balance = 0.0
        total_trades = 0
        winning_episodes = 0

        # Set agent to evaluation mode
        original_epsilon = self.agent.epsilon
        self.agent.epsilon = 0.0  # No exploration during evaluation

        for episode in range(num_episodes):
            results = self.run_episode(episode, training=False)
            evaluation_results.append(results)

            total_reward += results['reward']
            total_balance += results['balance']
            total_trades += results['trades']

            if results['pnl'] > 0:
                winning_episodes += 1

        # Restore original epsilon
        self.agent.epsilon = original_epsilon

        # Calculate summary statistics
        avg_reward = total_reward / num_episodes
        avg_balance = total_balance / num_episodes
        avg_trades = total_trades / num_episodes
        win_rate = winning_episodes / num_episodes

        evaluation_summary = {
            'num_episodes': num_episodes,
            'avg_reward': avg_reward,
            'avg_balance': avg_balance,
            'avg_pnl': avg_balance - self.initial_balance,
            'avg_pnl_percentage': (avg_balance - self.initial_balance) / self.initial_balance * 100,
            'avg_trades': avg_trades,
            'win_rate': win_rate,
            'results': evaluation_results
        }

        logger.info(f"Evaluation complete - Avg Reward: {avg_reward:.4f}, Win Rate: {win_rate:.2%}")
        return evaluation_summary
    def train(self, save_path: Optional[str] = None) -> Dict:
        """Train the RL agent"""
        logger.info("Starting RL agent training...")

        # Setup environment and agent
        self.environment, self.agent = self.setup_environment_and_agent()

        # Training state
        start_time = time.time()
        best_eval_reward = -float('inf')

        # Training loop
        for episode in range(self.num_episodes):
            episode_start_time = time.time()

            # Run training episode
            results = self.run_episode(episode, training=True)

            # Track metrics
            self.episode_rewards.append(results['reward'])
            self.episode_lengths.append(results['steps'])
            self.episode_balances.append(results['balance'])
            self.episode_trades.append(results['trades'])
            if results.get('avg_loss', 0) > 0:
                self.training_losses.append(results['avg_loss'])

            # Update best metrics
            if results['reward'] > self.best_reward:
                self.best_reward = results['reward']
            if results['balance'] > self.best_balance:
                self.best_balance = results['balance']

            # Calculate running averages
            recent_rewards = self.episode_rewards[-100:]  # Last 100 episodes
            recent_balances = self.episode_balances[-100:]
            avg_reward = np.mean(recent_rewards)
            avg_balance = np.mean(recent_balances)
            self.avg_rewards.append(avg_reward)

            # Log progress
            episode_time = time.time() - episode_start_time
            if episode % 10 == 0:
                logger.info(
                    f"Episode {episode}/{self.num_episodes} - "
                    f"Reward: {results['reward']:.4f}, Balance: ${results['balance']:.2f}, "
                    f"Trades: {results['trades']}, PnL: {results['pnl_percentage']:.2f}%, "
                    f"Epsilon: {self.agent.epsilon:.3f}, Time: {episode_time:.2f}s"
                )

            # Evaluation
            if episode % self.evaluation_frequency == 0 and episode > 0:
                eval_results = self.evaluate_agent(num_episodes=5)

                # Track win rate
                self.win_rates.append(eval_results['win_rate'])

                logger.info(
                    f"Evaluation - Avg Reward: {eval_results['avg_reward']:.4f}, "
                    f"Win Rate: {eval_results['win_rate']:.2%}, "
                    f"Avg PnL: {eval_results['avg_pnl_percentage']:.2f}%"
                )

                # Save best model
                if eval_results['avg_reward'] > best_eval_reward:
                    best_eval_reward = eval_results['avg_reward']
                    if save_path:
                        best_path = save_path.replace('.pt', '_best.pt')
                        self.agent.save(best_path)
                        logger.info(f"New best model saved: {best_path}")

            # Save checkpoint
            if episode % self.save_frequency == 0 and episode > 0 and save_path:
                checkpoint_path = save_path.replace('.pt', f'_checkpoint_{episode}.pt')
                self.agent.save(checkpoint_path)
                logger.info(f"Checkpoint saved: {checkpoint_path}")

        # Training complete
        total_time = time.time() - start_time
        logger.info(f"Training completed in {total_time:.2f} seconds")

        # Final evaluation
        final_eval = self.evaluate_agent(num_episodes=20)

        # Save final model
        if save_path:
            self.agent.save(save_path)
            logger.info(f"Final model saved: {save_path}")

        # Prepare training results
        training_results = {
            'total_episodes': self.num_episodes,
            'total_time': total_time,
            'best_reward': self.best_reward,
            'best_balance': self.best_balance,
            'final_evaluation': final_eval,
            'episode_rewards': self.episode_rewards,
            'episode_balances': self.episode_balances,
            'episode_trades': self.episode_trades,
            'training_losses': self.training_losses,
            'avg_rewards': self.avg_rewards,
            'win_rates': self.win_rates,
            'agent_config': {
                'state_dim': self.state_dim,
                'action_dim': self.action_dim,
                'learning_rate': self.learning_rate,
                'epsilon_final': self.agent.epsilon
            }
        }

        return training_results
    def backtest_agent(self, agent_path: str, test_episodes: int = 50) -> Dict:
        """Backtest trained agent"""
        logger.info(f"Backtesting agent from {agent_path}...")

        # Setup environment and agent
        self.environment, self.agent = self.setup_environment_and_agent()

        # Load trained agent
        self.agent.load(agent_path)

        # Run backtest
        backtest_results = self.evaluate_agent(test_episodes)

        # Additional analysis
        results = backtest_results['results']
        pnls = [r['pnl_percentage'] for r in results]
        rewards = [r['reward'] for r in results]
        trades = [r['trades'] for r in results]

        analysis = {
            'total_episodes': test_episodes,
            'avg_pnl': np.mean(pnls),
            'std_pnl': np.std(pnls),
            'max_pnl': np.max(pnls),
            'min_pnl': np.min(pnls),
            'avg_reward': np.mean(rewards),
            'avg_trades': np.mean(trades),
            'win_rate': backtest_results['win_rate'],
            'profit_factor': (np.sum([p for p in pnls if p > 0]) / abs(np.sum([p for p in pnls if p < 0]))
                              if any(p < 0 for p in pnls) else float('inf')),
            'sharpe_ratio': np.mean(pnls) / np.std(pnls) if np.std(pnls) > 0 else 0,
            'max_drawdown': self._calculate_max_drawdown(pnls)
        }

        logger.info(f"Backtest complete - Win Rate: {analysis['win_rate']:.2%}, Avg PnL: {analysis['avg_pnl']:.2f}%")

        return {
            'backtest_results': backtest_results,
            'analysis': analysis
        }
    def _calculate_max_drawdown(self, pnls: List[float]) -> float:
        """Calculate maximum drawdown"""
        cumulative = np.cumsum(pnls)
        running_max = np.maximum.accumulate(cumulative)
        drawdowns = running_max - cumulative
        return np.max(drawdowns) if len(drawdowns) > 0 else 0.0
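    # Worked example for _calculate_max_drawdown (illustrative): for
    # pnls = [1.0, -2.0, 3.0, -1.0] the cumulative curve is [1.0, -1.0, 2.0, 1.0],
    # the running maximum is [1.0, 1.0, 2.0, 2.0], and the drawdowns are
    # [0.0, 2.0, 0.0, 1.0], so the reported maximum drawdown is 2.0,
    # measured in cumulative PnL percentage points rather than dollars.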
    def plot_training_progress(self, save_path: Optional[str] = None):
        """Plot training progress"""
        if not self.episode_rewards:
            logger.warning("No training data to plot")
            return

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
        episodes = range(1, len(self.episode_rewards) + 1)

        # Episode rewards
        ax1.plot(episodes, self.episode_rewards, alpha=0.6, label='Episode Reward')
        if self.avg_rewards:
            ax1.plot(episodes, self.avg_rewards, 'r-', label='Avg Reward (100 episodes)')
        ax1.set_title('Training Rewards')
        ax1.set_xlabel('Episode')
        ax1.set_ylabel('Reward')
        ax1.legend()
        ax1.grid(True)

        # Episode balances
        ax2.plot(episodes, self.episode_balances, alpha=0.6, label='Episode Balance')
        ax2.axhline(y=self.initial_balance, color='r', linestyle='--', label='Initial Balance')
        ax2.set_title('Portfolio Balance')
        ax2.set_xlabel('Episode')
        ax2.set_ylabel('Balance ($)')
        ax2.legend()
        ax2.grid(True)

        # Training losses
        if self.training_losses:
            loss_episodes = np.linspace(1, len(self.episode_rewards), len(self.training_losses))
            ax3.plot(loss_episodes, self.training_losses, 'g-', alpha=0.8)
            ax3.set_title('Training Loss')
            ax3.set_xlabel('Episode')
            ax3.set_ylabel('Loss')
            ax3.grid(True)

        # Win rates
        if self.win_rates:
            eval_episodes = np.arange(self.evaluation_frequency,
                                      len(self.episode_rewards) + 1,
                                      self.evaluation_frequency)[:len(self.win_rates)]
            ax4.plot(eval_episodes, self.win_rates, 'purple', marker='o')
            ax4.set_title('Win Rate')
            ax4.set_xlabel('Episode')
            ax4.set_ylabel('Win Rate')
            ax4.grid(True)
            ax4.set_ylim(0, 1)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            logger.info(f"Training progress plot saved: {save_path}")

        plt.show()
    def log_episode_metrics(self, episode: int, metrics: Dict):
        """Log episode metrics to TensorBoard"""
        # Main performance metrics
        self.writer.add_scalar('Episode/TotalReward', metrics['total_reward'], episode)
        self.writer.add_scalar('Episode/FinalBalance', metrics['final_balance'], episode)
        self.writer.add_scalar('Episode/TotalReturn', metrics['total_return'], episode)
        self.writer.add_scalar('Episode/Steps', metrics['steps'], episode)

        # Trading metrics
        self.writer.add_scalar('Trading/TotalTrades', metrics['total_trades'], episode)
        self.writer.add_scalar('Trading/WinRate', metrics['win_rate'], episode)
        self.writer.add_scalar('Trading/ProfitFactor', metrics.get('profit_factor', 0), episode)
        self.writer.add_scalar('Trading/MaxDrawdown', metrics.get('max_drawdown', 0), episode)

        # Agent metrics
        self.writer.add_scalar('Agent/Epsilon', metrics['epsilon'], episode)
        self.writer.add_scalar('Agent/LearningRate', metrics.get('learning_rate', self.learning_rate), episode)
        self.writer.add_scalar('Agent/MemorySize', metrics.get('memory_size', 0), episode)

        # Loss metrics (if available)
        if 'loss' in metrics:
            self.writer.add_scalar('Agent/Loss', metrics['loss'], episode)
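    # Illustrative wiring (not called anywhere in this module): run_episode returns
    # keys like 'reward'/'balance', while log_episode_metrics expects
    # 'total_reward'/'final_balance', so a caller would map them roughly as follows,
    # assuming win_rate and epsilon are tracked separately:
    #     self.log_episode_metrics(episode, {
    #         'total_reward': results['reward'],
    #         'final_balance': results['balance'],
    #         'total_return': results['pnl_percentage'],
    #         'steps': results['steps'],
    #         'total_trades': results['trades'],
    #         'win_rate': self.win_rates[-1] if self.win_rates else 0.0,
    #         'epsilon': self.agent.epsilon,
    #     })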
class HybridTrainer:
"""
Hybrid training pipeline combining CNN and RL
"""
    def __init__(self, data_provider: DataProvider):
        self.data_provider = data_provider
        self.cnn_trainer = None
        self.rl_trainer = None
    def train_hybrid(self, symbols: List[str], cnn_save_path: str, rl_save_path: str) -> Dict:
        """Train CNN first, then RL with CNN features"""
        logger.info("Starting hybrid CNN + RL training...")

        # Phase 1: Train CNN
        logger.info("Phase 1: Training CNN...")
        from training.cnn_trainer import CNNTrainer
        self.cnn_trainer = CNNTrainer(self.data_provider)
        cnn_results = self.cnn_trainer.train(symbols, cnn_save_path)

        # Phase 2: Train RL
        logger.info("Phase 2: Training RL...")
        self.rl_trainer = RLTrainer(self.data_provider)
        rl_results = self.rl_trainer.train(rl_save_path)

        # Combine results
        hybrid_results = {
            'cnn_results': cnn_results,
            'rl_results': rl_results,
            'total_time': cnn_results['total_time'] + rl_results['total_time']
        }

        logger.info("Hybrid training completed!")
        return hybrid_results
# Export
__all__ = ['RLTrainer', 'HybridTrainer']
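
# Minimal usage sketch (illustrative only, not part of the original pipeline).
# It assumes DataProvider can be constructed without arguments and that
# "models/rl/scalping_agent.pt" is an acceptable save path; adjust both to the
# actual core.data_provider API and your directory layout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    provider = DataProvider()  # assumed no-arg constructor
    trainer = RLTrainer(provider)

    # Train, then visualize progress; train() returns the summary dict built above.
    results = trainer.train(save_path="models/rl/scalping_agent.pt")
    trainer.plot_training_progress(save_path="models/rl/training_progress.png")

    logger.info("Best reward: %.4f, best balance: $%.2f",
                results['best_reward'], results['best_balance'])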