""" RL Training Pipeline - Scalping Agent Training Comprehensive training pipeline for scalping RL agents: - Environment setup and management - Agent training with experience replay - Performance tracking and evaluation - Memory-efficient training loops """ import torch import numpy as np import pandas as pd import logging from typing import Dict, List, Tuple, Optional, Any import time from pathlib import Path import matplotlib.pyplot as plt from collections import deque import random from torch.utils.tensorboard import SummaryWriter # Add project imports import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from core.config import get_config from core.data_provider import DataProvider from models.rl.scalping_agent import ScalpingEnvironment, ScalpingRLAgent from utils.model_utils import robust_save, robust_load logger = logging.getLogger(__name__) class RLTrainer: """ RL Training Pipeline for Scalping """ def __init__(self, data_provider: DataProvider, config: Optional[Dict] = None): self.data_provider = data_provider self.config = config or get_config() # Training parameters self.num_episodes = 1000 self.max_steps_per_episode = 1000 self.training_frequency = 4 # Train every N steps self.evaluation_frequency = 50 # Evaluate every N episodes self.save_frequency = 100 # Save model every N episodes # Environment parameters self.symbols = ['ETH/USDT'] self.initial_balance = 1000.0 self.max_position_size = 0.1 # Agent parameters (will be set when we know state dimension) self.state_dim = None self.action_dim = 3 # BUY, SELL, HOLD self.learning_rate = 1e-4 self.memory_size = 50000 # Device self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Training state self.environment = None self.agent = None self.episode_rewards = [] self.episode_lengths = [] self.episode_balances = [] self.episode_trades = [] self.training_losses = [] # Performance tracking self.best_reward = -float('inf') self.best_balance = 0.0 self.win_rates = [] self.avg_rewards = [] # TensorBoard setup self.setup_tensorboard() logger.info(f"RLTrainer initialized for symbols: {self.symbols}") def setup_tensorboard(self): """Setup TensorBoard logging""" # Create tensorboard logs directory log_dir = Path("runs") / f"rl_training_{int(time.time())}" log_dir.mkdir(parents=True, exist_ok=True) self.writer = SummaryWriter(log_dir=str(log_dir)) self.tensorboard_dir = log_dir logger.info(f"TensorBoard logging to: {log_dir}") logger.info(f"Run: tensorboard --logdir=runs") def setup_environment_and_agent(self) -> Tuple[ScalpingEnvironment, ScalpingRLAgent]: """Setup trading environment and RL agent""" logger.info("Setting up environment and agent...") # Create environment environment = ScalpingEnvironment( data_provider=self.data_provider, symbol=self.symbols[0], initial_balance=self.initial_balance, max_position_size=self.max_position_size ) # Get state dimension by resetting environment initial_state = environment.reset() if initial_state is None: raise ValueError("Could not get initial state from environment") self.state_dim = len(initial_state) logger.info(f"State dimension: {self.state_dim}") # Create agent agent = ScalpingRLAgent( state_dim=self.state_dim, action_dim=self.action_dim, learning_rate=self.learning_rate, memory_size=self.memory_size ) return environment, agent def run_episode(self, episode_num: int, training: bool = True) -> Dict: """Run a single episode""" state = self.environment.reset() if state is None: return {'error': 'Could not reset environment'} 
        episode_reward = 0.0
        episode_loss = 0.0
        step_count = 0
        trades_made = 0

        # Episode loop
        for step in range(self.max_steps_per_episode):
            # Select action
            action = self.agent.act(state, training=training)

            # Execute action in environment
            next_state, reward, done, info = self.environment.step(action, step)
            if next_state is None:
                break

            # Store experience if training
            if training:
                # Determine if this is a high-priority experience
                priority = (abs(reward) > 0.1 or
                            info.get('trade_info', {}).get('executed', False))
                self.agent.remember(state, action, reward, next_state, done, priority)

                # Train agent
                if step % self.training_frequency == 0 and len(self.agent.memory) > self.agent.batch_size:
                    loss = self.agent.replay()
                    if loss is not None:
                        episode_loss += loss

            # Update state
            state = next_state
            episode_reward += reward
            step_count += 1

            # Track trades
            if info.get('trade_info', {}).get('executed', False):
                trades_made += 1

            if done:
                break

        # Episode results
        final_balance = info.get('balance', self.initial_balance)
        total_fees = info.get('total_fees', 0.0)

        episode_results = {
            'episode': episode_num,
            'reward': episode_reward,
            'steps': step_count,
            'balance': final_balance,
            'trades': trades_made,
            'fees': total_fees,
            'pnl': final_balance - self.initial_balance,
            'pnl_percentage': (final_balance - self.initial_balance) / self.initial_balance * 100,
            'avg_loss': episode_loss / max(step_count // self.training_frequency, 1) if training else 0
        }

        return episode_results

    def evaluate_agent(self, num_episodes: int = 10) -> Dict:
        """Evaluate agent performance"""
        logger.info(f"Evaluating agent over {num_episodes} episodes...")

        evaluation_results = []
        total_reward = 0.0
        total_balance = 0.0
        total_trades = 0
        winning_episodes = 0

        # Set agent to evaluation mode
        original_epsilon = self.agent.epsilon
        self.agent.epsilon = 0.0  # No exploration during evaluation

        for episode in range(num_episodes):
            results = self.run_episode(episode, training=False)
            evaluation_results.append(results)

            total_reward += results['reward']
            total_balance += results['balance']
            total_trades += results['trades']

            if results['pnl'] > 0:
                winning_episodes += 1

        # Restore original epsilon
        self.agent.epsilon = original_epsilon

        # Calculate summary statistics
        avg_reward = total_reward / num_episodes
        avg_balance = total_balance / num_episodes
        avg_trades = total_trades / num_episodes
        win_rate = winning_episodes / num_episodes

        evaluation_summary = {
            'num_episodes': num_episodes,
            'avg_reward': avg_reward,
            'avg_balance': avg_balance,
            'avg_pnl': avg_balance - self.initial_balance,
            'avg_pnl_percentage': (avg_balance - self.initial_balance) / self.initial_balance * 100,
            'avg_trades': avg_trades,
            'win_rate': win_rate,
            'results': evaluation_results
        }

        logger.info(f"Evaluation complete - Avg Reward: {avg_reward:.4f}, Win Rate: {win_rate:.2%}")
        return evaluation_summary

    def train(self, save_path: Optional[str] = None) -> Dict:
        """Train the RL agent"""
        logger.info("Starting RL agent training...")

        # Setup environment and agent
        self.environment, self.agent = self.setup_environment_and_agent()

        # Training state
        start_time = time.time()
        best_eval_reward = -float('inf')

        # Training loop
        for episode in range(self.num_episodes):
            episode_start_time = time.time()

            # Run training episode
            results = self.run_episode(episode, training=True)

            # Track metrics
            self.episode_rewards.append(results['reward'])
            self.episode_lengths.append(results['steps'])
            self.episode_balances.append(results['balance'])
            self.episode_trades.append(results['trades'])

            if results.get('avg_loss', 0) > 0:
                self.training_losses.append(results['avg_loss'])

            # Update best metrics
            if results['reward'] > self.best_reward:
                self.best_reward = results['reward']
            if results['balance'] > self.best_balance:
                self.best_balance = results['balance']

            # Calculate running averages
            recent_rewards = self.episode_rewards[-100:]  # Last 100 episodes
            recent_balances = self.episode_balances[-100:]
            avg_reward = np.mean(recent_rewards)
            avg_balance = np.mean(recent_balances)
            self.avg_rewards.append(avg_reward)

            # Log progress
            episode_time = time.time() - episode_start_time
            if episode % 10 == 0:
                logger.info(
                    f"Episode {episode}/{self.num_episodes} - "
                    f"Reward: {results['reward']:.4f}, Balance: ${results['balance']:.2f}, "
                    f"Trades: {results['trades']}, PnL: {results['pnl_percentage']:.2f}%, "
                    f"Epsilon: {self.agent.epsilon:.3f}, Time: {episode_time:.2f}s"
                )

            # Evaluation
            if episode % self.evaluation_frequency == 0 and episode > 0:
                eval_results = self.evaluate_agent(num_episodes=5)

                # Track win rate
                self.win_rates.append(eval_results['win_rate'])

                logger.info(
                    f"Evaluation - Avg Reward: {eval_results['avg_reward']:.4f}, "
                    f"Win Rate: {eval_results['win_rate']:.2%}, "
                    f"Avg PnL: {eval_results['avg_pnl_percentage']:.2f}%"
                )

                # Save best model
                if eval_results['avg_reward'] > best_eval_reward:
                    best_eval_reward = eval_results['avg_reward']
                    if save_path:
                        best_path = save_path.replace('.pt', '_best.pt')
                        self.agent.save(best_path)
                        logger.info(f"New best model saved: {best_path}")

            # Save checkpoint
            if episode % self.save_frequency == 0 and episode > 0 and save_path:
                checkpoint_path = save_path.replace('.pt', f'_checkpoint_{episode}.pt')
                self.agent.save(checkpoint_path)
                logger.info(f"Checkpoint saved: {checkpoint_path}")

        # Training complete
        total_time = time.time() - start_time
        logger.info(f"Training completed in {total_time:.2f} seconds")

        # Final evaluation
        final_eval = self.evaluate_agent(num_episodes=20)

        # Save final model
        if save_path:
            self.agent.save(save_path)
            logger.info(f"Final model saved: {save_path}")

        # Prepare training results
        training_results = {
            'total_episodes': self.num_episodes,
            'total_time': total_time,
            'best_reward': self.best_reward,
            'best_balance': self.best_balance,
            'final_evaluation': final_eval,
            'episode_rewards': self.episode_rewards,
            'episode_balances': self.episode_balances,
            'episode_trades': self.episode_trades,
            'training_losses': self.training_losses,
            'avg_rewards': self.avg_rewards,
            'win_rates': self.win_rates,
            'agent_config': {
                'state_dim': self.state_dim,
                'action_dim': self.action_dim,
                'learning_rate': self.learning_rate,
                'epsilon_final': self.agent.epsilon
            }
        }

        return training_results

    def backtest_agent(self, agent_path: str, test_episodes: int = 50) -> Dict:
        """Backtest a trained agent"""
        logger.info(f"Backtesting agent from {agent_path}...")

        # Setup environment and agent
        self.environment, self.agent = self.setup_environment_and_agent()

        # Load trained agent
        self.agent.load(agent_path)

        # Run backtest
        backtest_results = self.evaluate_agent(test_episodes)

        # Additional analysis
        results = backtest_results['results']
        pnls = [r['pnl_percentage'] for r in results]
        rewards = [r['reward'] for r in results]
        trades = [r['trades'] for r in results]

        analysis = {
            'total_episodes': test_episodes,
            'avg_pnl': np.mean(pnls),
            'std_pnl': np.std(pnls),
            'max_pnl': np.max(pnls),
            'min_pnl': np.min(pnls),
            'avg_reward': np.mean(rewards),
            'avg_trades': np.mean(trades),
            'win_rate': backtest_results['win_rate'],
            'profit_factor': (np.sum([p for p in pnls if p > 0]) / abs(np.sum([p for p in pnls if p < 0]))
                              if any(p < 0 for p in pnls)
                              else float('inf')),
            'sharpe_ratio': np.mean(pnls) / np.std(pnls) if np.std(pnls) > 0 else 0,
            'max_drawdown': self._calculate_max_drawdown(pnls)
        }

        logger.info(f"Backtest complete - Win Rate: {analysis['win_rate']:.2%}, Avg PnL: {analysis['avg_pnl']:.2f}%")

        return {
            'backtest_results': backtest_results,
            'analysis': analysis
        }

    def _calculate_max_drawdown(self, pnls: List[float]) -> float:
        """Calculate maximum drawdown"""
        cumulative = np.cumsum(pnls)
        running_max = np.maximum.accumulate(cumulative)
        drawdowns = running_max - cumulative
        return np.max(drawdowns) if len(drawdowns) > 0 else 0.0

    def plot_training_progress(self, save_path: Optional[str] = None):
        """Plot training progress"""
        if not self.episode_rewards:
            logger.warning("No training data to plot")
            return

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
        episodes = range(1, len(self.episode_rewards) + 1)

        # Episode rewards
        ax1.plot(episodes, self.episode_rewards, alpha=0.6, label='Episode Reward')
        if self.avg_rewards:
            ax1.plot(episodes, self.avg_rewards, 'r-', label='Avg Reward (100 episodes)')
        ax1.set_title('Training Rewards')
        ax1.set_xlabel('Episode')
        ax1.set_ylabel('Reward')
        ax1.legend()
        ax1.grid(True)

        # Episode balances
        ax2.plot(episodes, self.episode_balances, alpha=0.6, label='Episode Balance')
        ax2.axhline(y=self.initial_balance, color='r', linestyle='--', label='Initial Balance')
        ax2.set_title('Portfolio Balance')
        ax2.set_xlabel('Episode')
        ax2.set_ylabel('Balance ($)')
        ax2.legend()
        ax2.grid(True)

        # Training losses
        if self.training_losses:
            loss_episodes = np.linspace(1, len(self.episode_rewards), len(self.training_losses))
            ax3.plot(loss_episodes, self.training_losses, 'g-', alpha=0.8)
            ax3.set_title('Training Loss')
            ax3.set_xlabel('Episode')
            ax3.set_ylabel('Loss')
            ax3.grid(True)

        # Win rates
        if self.win_rates:
            eval_episodes = np.arange(
                self.evaluation_frequency,
                len(self.episode_rewards) + 1,
                self.evaluation_frequency
            )[:len(self.win_rates)]
            ax4.plot(eval_episodes, self.win_rates, 'purple', marker='o')
            ax4.set_title('Win Rate')
            ax4.set_xlabel('Episode')
            ax4.set_ylabel('Win Rate')
            ax4.grid(True)
            ax4.set_ylim(0, 1)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            logger.info(f"Training progress plot saved: {save_path}")

        plt.show()

    def log_episode_metrics(self, episode: int, metrics: Dict):
        """Log episode metrics to TensorBoard"""
        # Main performance metrics
        self.writer.add_scalar('Episode/TotalReward', metrics['total_reward'], episode)
        self.writer.add_scalar('Episode/FinalBalance', metrics['final_balance'], episode)
        self.writer.add_scalar('Episode/TotalReturn', metrics['total_return'], episode)
        self.writer.add_scalar('Episode/Steps', metrics['steps'], episode)

        # Trading metrics
        self.writer.add_scalar('Trading/TotalTrades', metrics['total_trades'], episode)
        self.writer.add_scalar('Trading/WinRate', metrics['win_rate'], episode)
        self.writer.add_scalar('Trading/ProfitFactor', metrics.get('profit_factor', 0), episode)
        self.writer.add_scalar('Trading/MaxDrawdown', metrics.get('max_drawdown', 0), episode)

        # Agent metrics
        self.writer.add_scalar('Agent/Epsilon', metrics['epsilon'], episode)
        self.writer.add_scalar('Agent/LearningRate', metrics.get('learning_rate', self.learning_rate), episode)
        self.writer.add_scalar('Agent/MemorySize', metrics.get('memory_size', 0), episode)

        # Loss metrics (if available)
        if 'loss' in metrics:
            self.writer.add_scalar('Agent/Loss', metrics['loss'], episode)


class HybridTrainer:
    """Hybrid training pipeline combining CNN and RL"""

    def __init__(self, data_provider: DataProvider):
        self.data_provider = data_provider
        self.cnn_trainer = None
        self.rl_trainer = None

    def train_hybrid(self, symbols: List[str], cnn_save_path: str, rl_save_path: str) -> Dict:
        """Train CNN first, then RL with CNN features"""
        logger.info("Starting hybrid CNN + RL training...")

        # Phase 1: Train CNN
        logger.info("Phase 1: Training CNN...")
        from training.cnn_trainer import CNNTrainer
        self.cnn_trainer = CNNTrainer(self.data_provider)
        cnn_results = self.cnn_trainer.train(symbols, cnn_save_path)

        # Phase 2: Train RL
        logger.info("Phase 2: Training RL...")
        self.rl_trainer = RLTrainer(self.data_provider)
        rl_results = self.rl_trainer.train(rl_save_path)

        # Combine results
        hybrid_results = {
            'cnn_results': cnn_results,
            'rl_results': rl_results,
            'total_time': cnn_results['total_time'] + rl_results['total_time']
        }

        logger.info("Hybrid training completed!")
        return hybrid_results


# Export
__all__ = ['RLTrainer', 'HybridTrainer']
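

# Minimal usage sketch (not part of the original module logic). It assumes a
# DataProvider can be constructed with no arguments and that the .pt paths below
# exist or are writable; both are illustrative placeholders, adjust to the real
# core.data_provider constructor and your model directory layout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Hypothetical wiring -- DataProvider() signature is an assumption
    data_provider = DataProvider()
    trainer = RLTrainer(data_provider)

    # Train, plot progress, then backtest the best checkpoint produced above
    training_results = trainer.train(save_path="models/rl/scalping_agent.pt")
    trainer.plot_training_progress(save_path="models/rl/training_progress.png")

    backtest = trainer.backtest_agent("models/rl/scalping_agent_best.pt", test_episodes=50)
    logger.info(f"Backtest win rate: {backtest['analysis']['win_rate']:.2%}")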