"""
|
|
RL Training Pipeline - Scalping Agent Training
|
|
|
|
Comprehensive training pipeline for scalping RL agents:
|
|
- Environment setup and management
|
|
- Agent training with experience replay
|
|
- Performance tracking and evaluation
|
|
- Memory-efficient training loops
|
|
"""
|
|
|
|
import torch
|
|
import numpy as np
|
|
import pandas as pd
|
|
import logging
|
|
from typing import Dict, List, Tuple, Optional, Any
|
|
import time
|
|
from pathlib import Path
|
|
import matplotlib.pyplot as plt
|
|
from collections import deque
|
|
import random
|
|
from torch.utils.tensorboard import SummaryWriter
|
|
|
|
# Add project imports
|
|
import sys
|
|
import os
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from core.config import get_config
|
|
from core.data_provider import DataProvider
|
|
from models.rl.scalping_agent import ScalpingEnvironment, ScalpingRLAgent
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class RLTrainer:
    """
    RL Training Pipeline for Scalping
    """

    def __init__(self, data_provider: DataProvider, config: Optional[Dict] = None):
        self.data_provider = data_provider
        self.config = config or get_config()

        # Training parameters
        self.num_episodes = 1000
        self.max_steps_per_episode = 1000
        self.training_frequency = 4     # Train every N steps
        self.evaluation_frequency = 50  # Evaluate every N episodes
        self.save_frequency = 100       # Save model every N episodes

        # Environment parameters
        self.symbols = ['ETH/USDT']
        self.initial_balance = 1000.0
        self.max_position_size = 0.1

        # Agent parameters (will be set when we know the state dimension)
        self.state_dim = None
        self.action_dim = 3  # BUY, SELL, HOLD
        self.learning_rate = 1e-4
        self.memory_size = 50000

        # Device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Training state
        self.environment = None
        self.agent = None
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_balances = []
        self.episode_trades = []
        self.training_losses = []

        # Performance tracking
        self.best_reward = -float('inf')
        self.best_balance = 0.0
        self.win_rates = []
        self.avg_rewards = []

        # TensorBoard setup
        self.setup_tensorboard()

        logger.info(f"RLTrainer initialized for symbols: {self.symbols}")

    def setup_tensorboard(self):
        """Set up TensorBoard logging"""
        # Create TensorBoard logs directory
        log_dir = Path("runs") / f"rl_training_{int(time.time())}"
        log_dir.mkdir(parents=True, exist_ok=True)

        self.writer = SummaryWriter(log_dir=str(log_dir))
        self.tensorboard_dir = log_dir

        logger.info(f"TensorBoard logging to: {log_dir}")
        logger.info("Run: tensorboard --logdir=runs")

    def setup_environment_and_agent(self) -> Tuple[ScalpingEnvironment, ScalpingRLAgent]:
        """Set up the trading environment and RL agent"""
        logger.info("Setting up environment and agent...")

        # Create environment
        environment = ScalpingEnvironment(
            data_provider=self.data_provider,
            symbol=self.symbols[0],
            initial_balance=self.initial_balance,
            max_position_size=self.max_position_size
        )

        # Get the state dimension by resetting the environment
        initial_state = environment.reset()
        if initial_state is None:
            raise ValueError("Could not get initial state from environment")

        self.state_dim = len(initial_state)
        logger.info(f"State dimension: {self.state_dim}")

        # Create agent
        agent = ScalpingRLAgent(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            learning_rate=self.learning_rate,
            memory_size=self.memory_size
        )

        return environment, agent

    def run_episode(self, episode_num: int, training: bool = True) -> Dict:
        """Run a single episode"""
        state = self.environment.reset()
        if state is None:
            return {'error': 'Could not reset environment'}

        episode_reward = 0.0
        episode_loss = 0.0
        step_count = 0
        trades_made = 0
        info = {}  # Keep defined even if the loop exits immediately

        # Episode loop
        for step in range(self.max_steps_per_episode):
            # Select action
            action = self.agent.act(state, training=training)

            # Execute action in environment
            next_state, reward, done, info = self.environment.step(action, step)

            if next_state is None:
                break

            # Store experience if training
            if training:
                # Flag high-priority experiences (large reward or an executed trade)
                priority = (abs(reward) > 0.1 or
                            info.get('trade_info', {}).get('executed', False))

                self.agent.remember(state, action, reward, next_state, done, priority)

                # Train agent
                if step % self.training_frequency == 0 and len(self.agent.memory) > self.agent.batch_size:
                    loss = self.agent.replay()
                    if loss is not None:
                        episode_loss += loss

            # Update state
            state = next_state
            episode_reward += reward
            step_count += 1

            # Track trades
            if info.get('trade_info', {}).get('executed', False):
                trades_made += 1

            if done:
                break

        # Episode results
        final_balance = info.get('balance', self.initial_balance)
        total_fees = info.get('total_fees', 0.0)

        episode_results = {
            'episode': episode_num,
            'reward': episode_reward,
            'steps': step_count,
            'balance': final_balance,
            'trades': trades_made,
            'fees': total_fees,
            'pnl': final_balance - self.initial_balance,
            'pnl_percentage': (final_balance - self.initial_balance) / self.initial_balance * 100,
            'avg_loss': episode_loss / max(step_count // self.training_frequency, 1) if training else 0.0
        }

        return episode_results

    def evaluate_agent(self, num_episodes: int = 10) -> Dict:
        """Evaluate agent performance"""
        logger.info(f"Evaluating agent over {num_episodes} episodes...")

        evaluation_results = []
        total_reward = 0.0
        total_balance = 0.0
        total_trades = 0
        winning_episodes = 0

        # Set agent to evaluation mode (no exploration)
        original_epsilon = self.agent.epsilon
        self.agent.epsilon = 0.0

        for episode in range(num_episodes):
            results = self.run_episode(episode, training=False)
            evaluation_results.append(results)

            total_reward += results['reward']
            total_balance += results['balance']
            total_trades += results['trades']

            if results['pnl'] > 0:
                winning_episodes += 1

        # Restore original epsilon
        self.agent.epsilon = original_epsilon

        # Calculate summary statistics
        avg_reward = total_reward / num_episodes
        avg_balance = total_balance / num_episodes
        avg_trades = total_trades / num_episodes
        win_rate = winning_episodes / num_episodes

        evaluation_summary = {
            'num_episodes': num_episodes,
            'avg_reward': avg_reward,
            'avg_balance': avg_balance,
            'avg_pnl': avg_balance - self.initial_balance,
            'avg_pnl_percentage': (avg_balance - self.initial_balance) / self.initial_balance * 100,
            'avg_trades': avg_trades,
            'win_rate': win_rate,
            'results': evaluation_results
        }

        logger.info(f"Evaluation complete - Avg Reward: {avg_reward:.4f}, Win Rate: {win_rate:.2%}")

        return evaluation_summary

    def train(self, save_path: Optional[str] = None) -> Dict:
        """Train the RL agent"""
        logger.info("Starting RL agent training...")

        # Set up environment and agent
        self.environment, self.agent = self.setup_environment_and_agent()

        # Training state
        start_time = time.time()
        best_eval_reward = -float('inf')

        # Training loop
        for episode in range(self.num_episodes):
            episode_start_time = time.time()

            # Run training episode
            results = self.run_episode(episode, training=True)

            # Track metrics
            self.episode_rewards.append(results['reward'])
            self.episode_lengths.append(results['steps'])
            self.episode_balances.append(results['balance'])
            self.episode_trades.append(results['trades'])

            if results.get('avg_loss', 0) > 0:
                self.training_losses.append(results['avg_loss'])

            # Update best metrics
            if results['reward'] > self.best_reward:
                self.best_reward = results['reward']

            if results['balance'] > self.best_balance:
                self.best_balance = results['balance']

            # Calculate running averages over the last 100 episodes
            recent_rewards = self.episode_rewards[-100:]
            recent_balances = self.episode_balances[-100:]

            avg_reward = np.mean(recent_rewards)
            avg_balance = np.mean(recent_balances)

            self.avg_rewards.append(avg_reward)

            # Log progress
            episode_time = time.time() - episode_start_time

            if episode % 10 == 0:
                logger.info(
                    f"Episode {episode}/{self.num_episodes} - "
                    f"Reward: {results['reward']:.4f}, Balance: ${results['balance']:.2f}, "
                    f"Trades: {results['trades']}, PnL: {results['pnl_percentage']:.2f}%, "
                    f"Epsilon: {self.agent.epsilon:.3f}, Time: {episode_time:.2f}s"
                )

            # Evaluation
            if episode % self.evaluation_frequency == 0 and episode > 0:
                eval_results = self.evaluate_agent(num_episodes=5)

                # Track win rate
                self.win_rates.append(eval_results['win_rate'])

                logger.info(
                    f"Evaluation - Avg Reward: {eval_results['avg_reward']:.4f}, "
                    f"Win Rate: {eval_results['win_rate']:.2%}, "
                    f"Avg PnL: {eval_results['avg_pnl_percentage']:.2f}%"
                )

                # Save best model
                if eval_results['avg_reward'] > best_eval_reward:
                    best_eval_reward = eval_results['avg_reward']
                    if save_path:
                        best_path = save_path.replace('.pt', '_best.pt')
                        self.agent.save(best_path)
                        logger.info(f"New best model saved: {best_path}")

            # Save checkpoint
            if episode % self.save_frequency == 0 and episode > 0 and save_path:
                checkpoint_path = save_path.replace('.pt', f'_checkpoint_{episode}.pt')
                self.agent.save(checkpoint_path)
                logger.info(f"Checkpoint saved: {checkpoint_path}")

        # Training complete
        total_time = time.time() - start_time
        logger.info(f"Training completed in {total_time:.2f} seconds")

        # Final evaluation
        final_eval = self.evaluate_agent(num_episodes=20)

        # Save final model
        if save_path:
            self.agent.save(save_path)
            logger.info(f"Final model saved: {save_path}")

        # Prepare training results
        training_results = {
            'total_episodes': self.num_episodes,
            'total_time': total_time,
            'best_reward': self.best_reward,
            'best_balance': self.best_balance,
            'final_evaluation': final_eval,
            'episode_rewards': self.episode_rewards,
            'episode_balances': self.episode_balances,
            'episode_trades': self.episode_trades,
            'training_losses': self.training_losses,
            'avg_rewards': self.avg_rewards,
            'win_rates': self.win_rates,
            'agent_config': {
                'state_dim': self.state_dim,
                'action_dim': self.action_dim,
                'learning_rate': self.learning_rate,
                'epsilon_final': self.agent.epsilon
            }
        }

        return training_results

    def backtest_agent(self, agent_path: str, test_episodes: int = 50) -> Dict:
        """Backtest a trained agent"""
        logger.info(f"Backtesting agent from {agent_path}...")

        # Set up environment and agent
        self.environment, self.agent = self.setup_environment_and_agent()

        # Load trained agent
        self.agent.load(agent_path)

        # Run backtest
        backtest_results = self.evaluate_agent(test_episodes)

        # Additional analysis
        results = backtest_results['results']
        pnls = [r['pnl_percentage'] for r in results]
        rewards = [r['reward'] for r in results]
        trades = [r['trades'] for r in results]

        analysis = {
            'total_episodes': test_episodes,
            'avg_pnl': np.mean(pnls),
            'std_pnl': np.std(pnls),
            'max_pnl': np.max(pnls),
            'min_pnl': np.min(pnls),
            'avg_reward': np.mean(rewards),
            'avg_trades': np.mean(trades),
            'win_rate': backtest_results['win_rate'],
            # Gross profit divided by gross loss; infinite when no episode lost money
            'profit_factor': (np.sum([p for p in pnls if p > 0]) / abs(np.sum([p for p in pnls if p < 0]))
                              if any(p < 0 for p in pnls) else float('inf')),
            # Per-episode Sharpe ratio (mean/std of episode PnL, not annualized)
            'sharpe_ratio': np.mean(pnls) / np.std(pnls) if np.std(pnls) > 0 else 0.0,
            'max_drawdown': self._calculate_max_drawdown(pnls)
        }

        logger.info(f"Backtest complete - Win Rate: {analysis['win_rate']:.2%}, Avg PnL: {analysis['avg_pnl']:.2f}%")

        return {
            'backtest_results': backtest_results,
            'analysis': analysis
        }

    def _calculate_max_drawdown(self, pnls: List[float]) -> float:
        """Calculate the maximum drawdown of the cumulative PnL curve"""
        cumulative = np.cumsum(pnls)
        running_max = np.maximum.accumulate(cumulative)
        drawdowns = running_max - cumulative
        return float(np.max(drawdowns)) if len(drawdowns) > 0 else 0.0

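    # Worked example (illustrative only, not part of the pipeline): for per-episode
    # PnLs [1.0, -2.0, 3.0, -1.5], the cumulative curve is [1.0, -1.0, 2.0, 0.5],
    # the running maximum is [1.0, 1.0, 2.0, 2.0], and the drawdowns are
    # [0.0, 2.0, 0.0, 1.5], so _calculate_max_drawdown returns 2.0
    # (expressed in PnL percentage points, matching the backtest usage above).
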
    def plot_training_progress(self, save_path: Optional[str] = None):
        """Plot training progress"""
        if not self.episode_rewards:
            logger.warning("No training data to plot")
            return

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

        episodes = range(1, len(self.episode_rewards) + 1)

        # Episode rewards
        ax1.plot(episodes, self.episode_rewards, alpha=0.6, label='Episode Reward')
        if self.avg_rewards:
            ax1.plot(episodes, self.avg_rewards, 'r-', label='Avg Reward (100 episodes)')
        ax1.set_title('Training Rewards')
        ax1.set_xlabel('Episode')
        ax1.set_ylabel('Reward')
        ax1.legend()
        ax1.grid(True)

        # Episode balances
        ax2.plot(episodes, self.episode_balances, alpha=0.6, label='Episode Balance')
        ax2.axhline(y=self.initial_balance, color='r', linestyle='--', label='Initial Balance')
        ax2.set_title('Portfolio Balance')
        ax2.set_xlabel('Episode')
        ax2.set_ylabel('Balance ($)')
        ax2.legend()
        ax2.grid(True)

        # Training losses
        if self.training_losses:
            loss_episodes = np.linspace(1, len(self.episode_rewards), len(self.training_losses))
            ax3.plot(loss_episodes, self.training_losses, 'g-', alpha=0.8)
        ax3.set_title('Training Loss')
        ax3.set_xlabel('Episode')
        ax3.set_ylabel('Loss')
        ax3.grid(True)

        # Win rates
        if self.win_rates:
            eval_episodes = np.arange(self.evaluation_frequency,
                                      len(self.episode_rewards) + 1,
                                      self.evaluation_frequency)[:len(self.win_rates)]
            ax4.plot(eval_episodes, self.win_rates, 'purple', marker='o')
        ax4.set_title('Win Rate')
        ax4.set_xlabel('Episode')
        ax4.set_ylabel('Win Rate')
        ax4.grid(True)
        ax4.set_ylim(0, 1)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            logger.info(f"Training progress plot saved: {save_path}")

        plt.show()

    def log_episode_metrics(self, episode: int, metrics: Dict):
        """Log per-episode metrics to TensorBoard.

        Expects a metrics dict with keys such as 'total_reward', 'final_balance',
        'total_return', 'steps', 'total_trades', 'win_rate' and 'epsilon'.
        """
        # Main performance metrics
        self.writer.add_scalar('Episode/TotalReward', metrics['total_reward'], episode)
        self.writer.add_scalar('Episode/FinalBalance', metrics['final_balance'], episode)
        self.writer.add_scalar('Episode/TotalReturn', metrics['total_return'], episode)
        self.writer.add_scalar('Episode/Steps', metrics['steps'], episode)

        # Trading metrics
        self.writer.add_scalar('Trading/TotalTrades', metrics['total_trades'], episode)
        self.writer.add_scalar('Trading/WinRate', metrics['win_rate'], episode)
        self.writer.add_scalar('Trading/ProfitFactor', metrics.get('profit_factor', 0), episode)
        self.writer.add_scalar('Trading/MaxDrawdown', metrics.get('max_drawdown', 0), episode)

        # Agent metrics
        self.writer.add_scalar('Agent/Epsilon', metrics['epsilon'], episode)
        self.writer.add_scalar('Agent/LearningRate', metrics.get('learning_rate', self.learning_rate), episode)
        self.writer.add_scalar('Agent/MemorySize', metrics.get('memory_size', 0), episode)

        # Loss metrics (if available)
        if 'loss' in metrics:
            self.writer.add_scalar('Agent/Loss', metrics['loss'], episode)

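    # Note: train() does not currently call log_episode_metrics(), so the
    # SummaryWriter created in setup_tensorboard() receives no scalars. A hedged
    # sketch of how it could be wired into the training loop (keys on the left are
    # the ones this method expects; values come from run_episode() results):
    #
    #     self.log_episode_metrics(episode, {
    #         'total_reward': results['reward'],
    #         'final_balance': results['balance'],
    #         'total_return': results['pnl_percentage'],
    #         'steps': results['steps'],
    #         'total_trades': results['trades'],
    #         'win_rate': self.win_rates[-1] if self.win_rates else 0.0,
    #         'epsilon': self.agent.epsilon,
    #     })

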
class HybridTrainer:
    """
    Hybrid training pipeline combining CNN and RL
    """

    def __init__(self, data_provider: DataProvider):
        self.data_provider = data_provider
        self.cnn_trainer = None
        self.rl_trainer = None

    def train_hybrid(self, symbols: List[str], cnn_save_path: str, rl_save_path: str) -> Dict:
        """Train the CNN first, then the RL agent with CNN features"""
        logger.info("Starting hybrid CNN + RL training...")

        # Phase 1: Train CNN
        logger.info("Phase 1: Training CNN...")
        from training.cnn_trainer import CNNTrainer

        self.cnn_trainer = CNNTrainer(self.data_provider)
        cnn_results = self.cnn_trainer.train(symbols, cnn_save_path)

        # Phase 2: Train RL
        logger.info("Phase 2: Training RL...")
        self.rl_trainer = RLTrainer(self.data_provider)
        rl_results = self.rl_trainer.train(rl_save_path)

        # Combine results
        hybrid_results = {
            'cnn_results': cnn_results,
            'rl_results': rl_results,
            'total_time': cnn_results['total_time'] + rl_results['total_time']
        }

        logger.info("Hybrid training completed!")
        return hybrid_results


# Export
__all__ = ['RLTrainer', 'HybridTrainer']
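

# --- Minimal usage sketch (illustrative only) ---
# This block is a hedged example, not part of the pipeline itself. It assumes
# DataProvider can be constructed without arguments and that the save paths below
# exist; adjust both to match core.data_provider.DataProvider and your layout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    data_provider = DataProvider()  # assumption: no-arg constructor
    trainer = RLTrainer(data_provider)

    # Train and persist the agent, then plot progress and run a backtest.
    results = trainer.train(save_path="models/rl/scalping_agent.pt")
    trainer.plot_training_progress(save_path="runs/rl_training_progress.png")
    backtest = trainer.backtest_agent("models/rl/scalping_agent.pt", test_episodes=50)

    logger.info(f"Best reward: {results['best_reward']:.4f}, "
                f"backtest win rate: {backtest['analysis']['win_rate']:.2%}")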