# gogo2/NN/train_rl.py
import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import logging
import time
from datetime import datetime
import os
import sys
import pandas as pd
import gym
import json

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from NN.utils.data_interface import DataInterface
from NN.utils.trading_env import TradingEnvironment
from NN.models.dqn_agent import DQNAgent
from NN.utils.signal_interpreter import SignalInterpreter

# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rl_training.log'),
        logging.StreamHandler()
    ]
)


class RLTradingEnvironment(gym.Env):
    """
    Reinforcement Learning environment for trading with technical indicators
    from multiple timeframes
    """
    def __init__(self, features_1m, features_5m, features_15m, window_size=20, trading_fee=0.001):
        super().__init__()

        # Initialize environment attributes
        self.window_size = window_size
        self.num_features = features_1m.shape[1]  # Columns per timeframe (close price is the last column)
        self.num_timeframes = 3  # 1m, 5m, 15m
        self.feature_dim = self.num_features * self.num_timeframes

        # Store features from different timeframes
        self.features_1m = features_1m
        self.features_5m = features_5m
        self.features_15m = features_15m

        # Trading parameters
        self.initial_balance = 1.0
        self.trading_fee = trading_fee

        # Define action and observation spaces
        self.action_space = gym.spaces.Discrete(3)  # 0: Buy, 1: Sell, 2: Hold
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size, self.feature_dim),
            dtype=np.float32
        )

        # State variables
        self.reset()
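
    # Shape example (illustrative numbers only): with window_size=20 and, say,
    # 10 feature columns per timeframe, every observation produced by
    # _get_observation() is a (20, 3 * 10) = (20, 30) float32 array.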
    def reset(self):
        """Reset the environment to initial state"""
        self.balance = self.initial_balance
        self.position = 0.0  # Amount of asset held
        self.trade_entry_price = 0.0  # Price at which the current position was opened
        self.current_step = self.window_size
        self.trades = 0
        self.wins = 0
        self.losses = 0
        self.trade_history = []

        # Get initial observation
        observation = self._get_observation()
        return observation
    def _get_observation(self):
        """
        Get the current state observation.
        Combine features from multiple timeframes, reshaped for the CNN.
        """
        # Calculate indices for each timeframe
        idx_1m = self.current_step
        idx_5m = idx_1m // 5
        idx_15m = idx_1m // 15

        # Extract feature windows from each timeframe
        window_1m = self.features_1m[idx_1m - self.window_size:idx_1m]

        # Handle 5m timeframe
        start_5m = max(0, idx_5m - self.window_size)
        window_5m = self.features_5m[start_5m:idx_5m]

        # Handle 15m timeframe
        start_15m = max(0, idx_15m - self.window_size)
        window_15m = self.features_15m[start_15m:idx_15m]

        # Pad if needed (for 5m and 15m)
        if len(window_5m) < self.window_size:
            padding = np.zeros((self.window_size - len(window_5m), window_5m.shape[1]))
            window_5m = np.vstack([padding, window_5m])
        if len(window_15m) < self.window_size:
            padding = np.zeros((self.window_size - len(window_15m), window_15m.shape[1]))
            window_15m = np.vstack([padding, window_15m])

        # Combine features from all timeframes
        combined_features = np.hstack([
            window_1m.reshape(self.window_size, -1),
            window_5m.reshape(self.window_size, -1),
            window_15m.reshape(self.window_size, -1)
        ])

        # Convert to float32 and handle any NaN values
        combined_features = np.nan_to_num(combined_features, nan=0.0).astype(np.float32)
        return combined_features
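
    # Worked index-alignment example: at current_step = 120 the 1m window covers
    # rows 100-119, the 5m window ends at index 120 // 5 = 24 (rows 4-23), and
    # the 15m window ends at index 120 // 15 = 8; only 8 rows of 15m history
    # exist at that point, so that window is front-padded with 12 rows of zeros.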
    def step(self, action):
        """
        Take an action in the environment and return the next state, reward, done flag, and info

        Args:
            action (int): 0 = Buy, 1 = Sell, 2 = Hold

        Returns:
            tuple: (observation, reward, done, info)
        """
        # Get current and next price
        current_price = self.features_1m[self.current_step, -1]  # Close price is last column
        next_price = self.features_1m[self.current_step + 1, -1]

        # Handle zero or negative prices
        if current_price <= 0:
            current_price = 1e-8  # Small positive number
        if next_price <= 0:
            next_price = current_price  # Use current price if next price is invalid

        price_change = (next_price - current_price) / current_price

        # Default reward is slightly negative to discourage inaction
        reward = -0.0001
        done = False

        # Execute action
        if action == 0:  # BUY
            if self.position == 0:  # Only buy if not already in position
                self.position = self.balance * (1 - self.trading_fee)
                self.balance = 0
                self.trades += 1
                reward = 0  # Neutral reward for entering position
                self.trade_entry_price = current_price
        elif action == 1:  # SELL
            if self.position > 0:  # Only sell if in position
                # Calculate position value at current price
                position_value = self.position * (1 + price_change)
                self.balance = position_value * (1 - self.trading_fee)

                # Calculate profit/loss from trade
                profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price
                reward = profit_pct * 10  # Scale reward by profit percentage

                # Update win/loss count
                if profit_pct > 0:
                    self.wins += 1
                else:
                    self.losses += 1

                # Record trade
                self.trade_history.append({
                    'entry_price': self.trade_entry_price,
                    'exit_price': next_price,
                    'profit_pct': profit_pct
                })

                # Reset position
                self.position = 0
        # else: (action == 2 - HOLD) - no position change

        # Move to next step
        self.current_step += 1

        # Check if done
        if self.current_step >= len(self.features_1m) - 1:
            done = True

            # Apply final evaluation
            if self.position > 0:
                # Force close position at the end
                position_value = self.position * (1 + price_change)
                self.balance = position_value * (1 - self.trading_fee)
                profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price
                reward += profit_pct * 10

                # Update win/loss count
                if profit_pct > 0:
                    self.wins += 1
                else:
                    self.losses += 1

                # Reset position so the closed trade is not counted twice in total_value
                self.position = 0

        # Get the next observation
        observation = self._get_observation()

        # Calculate metrics for info
        total_value = self.balance + self.position * next_price
        gain = (total_value - self.initial_balance) / self.initial_balance
        self.win_rate = self.wins / max(1, self.trades)

        info = {
            'balance': self.balance,
            'position': self.position,
            'total_value': total_value,
            'gain': gain,
            'trades': self.trades,
            'win_rate': self.win_rate
        }

        return observation, reward, done, info
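

# A minimal smoke test of the environment API, included as a hedged example:
# the synthetic random-walk features and the column count below are
# illustrative assumptions, not the data used by train_rl().
def _run_environment_smoke_test(num_rows=600, num_columns=6, seed=0):
    """Exercise RLTradingEnvironment end-to-end with synthetic data and a random policy."""
    rng = np.random.default_rng(seed)

    # Random features; the last column stands in for the close price
    features_1m = rng.normal(size=(num_rows, num_columns)).astype(np.float32)
    features_1m[:, -1] = 100.0 + np.cumsum(rng.normal(scale=0.1, size=num_rows))
    features_5m = features_1m[::5].copy()
    features_15m = features_1m[::15].copy()

    env = RLTradingEnvironment(features_1m, features_5m, features_15m)
    state = env.reset()
    done = False
    info = {}
    while not done:
        action = env.action_space.sample()  # random actions, just to exercise step()
        state, reward, done, info = env.step(action)

    logger.info(f"Smoke test finished - gain: {info['gain']:.4f}, trades: {info['trades']}")
    return info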


def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent"):
    """
    Train DQN agent for RL-based trading with extended training and monitoring
    """
    logger.info("Starting extended RL training for trading...")

    # Environment setup
    window_size = 20
    timeframes = ["1m", "5m", "15m"]
    trading_fee = 0.001

    # Ensure save directory exists
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Setup TensorBoard for monitoring
    writer = SummaryWriter(f'runs/rl_training_{datetime.now().strftime("%Y%m%d_%H%M%S")}')

    # Data loading
    data_interface = DataInterface(
        symbol="BTC/USDT",
        timeframes=timeframes
    )

    # Get training data for each timeframe with more data
    features_1m = data_interface.get_training_data("1m", n_candles=5000)
    features_5m = data_interface.get_training_data("5m", n_candles=2500)
    features_15m = data_interface.get_training_data("15m", n_candles=2500)

    if features_1m is None or features_5m is None or features_15m is None:
        logger.error("Failed to load training data")
        return None

    # Convert DataFrames to numpy arrays, excluding timestamp column
    features_1m = features_1m.drop('timestamp', axis=1, errors='ignore').values
    features_5m = features_5m.drop('timestamp', axis=1, errors='ignore').values
    features_15m = features_15m.drop('timestamp', axis=1, errors='ignore').values

    # Calculate number of features per timeframe
    num_features = features_1m.shape[1]  # Number of features after dropping timestamp

    # Create environment
    env = RLTradingEnvironment(
        features_1m=features_1m,
        features_5m=features_5m,
        features_15m=features_15m,
        window_size=window_size,
        trading_fee=trading_fee
    )

    # Create agent with adjusted parameters for longer training
    state_size = window_size
    action_size = 3
    agent = DQNAgent(
        state_size=state_size,
        action_size=action_size,
        window_size=window_size,
        num_features=num_features,
        timeframes=timeframes,
        learning_rate=0.0005,   # Reduced learning rate for stability
        gamma=0.99,             # Increased discount factor
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.999,    # Slower epsilon decay
        memory_size=50000,      # Increased memory size
        batch_size=128          # Increased batch size
    )
    # Variables to track best performance
    best_reward = float('-inf')
    best_episode = 0
    best_pnl = float('-inf')
    best_win_rate = 0.0

    # Training metrics
    episode_rewards = []
    episode_pnls = []
    episode_win_rates = []
    episode_trades = []

    # Check if previous best model exists and load it
    best_model_path = f"{save_path}_best"
    if os.path.exists(f"{best_model_path}_policy.pt"):
        try:
            logger.info(f"Loading previous best model from {best_model_path}")
            agent.load(best_model_path)

            metadata_path = f"{best_model_path}_metadata.json"
            if os.path.exists(metadata_path):
                with open(metadata_path, 'r') as f:
                    metadata = json.load(f)
                best_reward = metadata.get('best_reward', best_reward)
                best_episode = metadata.get('best_episode', best_episode)
                best_pnl = metadata.get('best_pnl', best_pnl)
                best_win_rate = metadata.get('best_win_rate', best_win_rate)
                logger.info(f"Loaded previous best metrics - Reward: {best_reward:.4f}, PnL: {best_pnl:.4f}, Win Rate: {best_win_rate:.4f}")
        except Exception as e:
            logger.error(f"Error loading previous best model: {e}")
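
    # The metadata file read above (and written inside the training loop below)
    # is a small JSON document; the values here are purely illustrative:
    # {"best_reward": 12.34, "best_episode": 1350, "best_pnl": 0.08,
    #  "best_win_rate": 0.56, "date": "2025-03-31 11:12:45"}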
    # Training loop
    try:
        for episode in range(1, num_episodes + 1):
            state = env.reset()
            total_reward = 0
            done = False
            steps = 0

            while not done and steps < max_steps:
                action = agent.act(state)
                next_state, reward, done, info = env.step(action)
                agent.remember(state, action, reward, next_state, done)

                # Learn from experience
                loss = agent.replay()

                state = next_state
                total_reward += reward
                steps += 1

            # Calculate episode metrics
            episode_rewards.append(total_reward)
            episode_pnls.append(info['gain'])
            episode_win_rates.append(info['win_rate'])
            episode_trades.append(info['trades'])

            # Log to TensorBoard
            writer.add_scalar('Reward/episode', total_reward, episode)
            writer.add_scalar('PnL/episode', info['gain'], episode)
            writer.add_scalar('WinRate/episode', info['win_rate'], episode)
            writer.add_scalar('Trades/episode', info['trades'], episode)
            writer.add_scalar('Epsilon/episode', agent.epsilon, episode)

            # Save the best model based on multiple metrics (only every 50 episodes)
            is_better = False
            if episode % 50 == 0:  # Only check for saving every 50 episodes
                if (info['gain'] > best_pnl and info['win_rate'] > 0.5) or \
                   (info['gain'] > best_pnl * 1.1) or \
                   (info['win_rate'] > best_win_rate * 1.1):
                    best_reward = total_reward
                    best_episode = episode
                    best_pnl = info['gain']
                    best_win_rate = info['win_rate']
                    agent.save(best_model_path)
                    is_better = True

                    # Save metadata about the best model
                    metadata = {
                        'best_reward': best_reward,
                        'best_episode': best_episode,
                        'best_pnl': best_pnl,
                        'best_win_rate': best_win_rate,
                        'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }
                    with open(f"{best_model_path}_metadata.json", 'w') as f:
                        json.dump(metadata, f)

            # Log training progress
            if episode % 10 == 0:
                avg_reward = sum(episode_rewards[-10:]) / 10
                avg_pnl = sum(episode_pnls[-10:]) / 10
                avg_win_rate = sum(episode_win_rates[-10:]) / 10
                avg_trades = sum(episode_trades[-10:]) / 10

                status = "NEW BEST!" if is_better else ""
                logger.info(f"Episode {episode}/{num_episodes} {status}")
                logger.info("Metrics (last 10 episodes):")
                logger.info(f"  Reward: {avg_reward:.4f}")
                logger.info(f"  PnL: {avg_pnl:.4f}")
                logger.info(f"  Win Rate: {avg_win_rate:.4f}")
                logger.info(f"  Trades: {avg_trades:.2f}")
                logger.info(f"  Epsilon: {agent.epsilon:.4f}")
                logger.info(f"Best so far - PnL: {best_pnl:.4f}, Win Rate: {best_win_rate:.4f}")
    except KeyboardInterrupt:
        logger.info("Training interrupted by user. Best model checkpoints were already saved during training.")

    # Close TensorBoard writer
    writer.close()
    # Final logs
    logger.info(f"Training completed. Best model from episode {best_episode}")
    logger.info("Best metrics:")
    logger.info(f"  Reward: {best_reward:.4f}")
    logger.info(f"  PnL: {best_pnl:.4f}")
    logger.info(f"  Win Rate: {best_win_rate:.4f}")

    # Return the agent for potential further use
    return agent
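

# Evaluation sketch (not part of the original training flow): runs one greedy
# episode with a trained agent. It assumes that agent.act() keys its
# exploration off agent.epsilon, which is how the epsilon schedule above is used.
def evaluate_agent(agent, env, max_steps=2000):
    """Run a single evaluation episode and return the final info dict."""
    previous_epsilon = agent.epsilon
    agent.epsilon = 0.0  # assumed to make act() purely greedy

    state = env.reset()
    done = False
    steps = 0
    info = {}
    while not done and steps < max_steps:
        action = agent.act(state)
        state, _, done, info = env.step(action)
        steps += 1

    agent.epsilon = previous_epsilon  # restore the exploration setting
    logger.info(f"Evaluation - PnL: {info.get('gain', 0.0):.4f}, "
                f"Win Rate: {info.get('win_rate', 0.0):.4f}, Trades: {info.get('trades', 0)}")
    return info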


if __name__ == "__main__":
    train_rl()
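
# Usage sketch (assumes the script is launched from the repository root and
# that DataInterface can supply cached BTC/USDT candles):
#   python NN/train_rl.py
#   tensorboard --logdir runs   # inspect the Reward/PnL/WinRate curves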