import torch import numpy as np from torch.utils.tensorboard import SummaryWriter import logging import time from datetime import datetime import os import sys import pandas as pd import gym import json # Add parent directory to path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from NN.utils.data_interface import DataInterface from NN.utils.trading_env import TradingEnvironment from NN.models.dqn_agent import DQNAgent from NN.utils.signal_interpreter import SignalInterpreter # Configure logging logger = logging.getLogger(__name__) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler('rl_training.log'), logging.StreamHandler() ] ) class RLTradingEnvironment(gym.Env): """ Reinforcement Learning environment for trading with technical indicators from multiple timeframes """ def __init__(self, features_1m, features_5m, features_15m, window_size=20, trading_fee=0.001): super().__init__() # Initialize attributes before parent class self.window_size = window_size self.num_features = features_1m.shape[1] - 1 # Exclude close price self.num_timeframes = 3 # 1m, 5m, 15m self.feature_dim = self.num_features * self.num_timeframes # Store features from different timeframes self.features_1m = features_1m self.features_5m = features_5m self.features_15m = features_15m # Trading parameters self.initial_balance = 1.0 self.trading_fee = trading_fee # Define action and observation spaces self.action_space = gym.spaces.Discrete(3) # 0: Buy, 1: Sell, 2: Hold self.observation_space = gym.spaces.Box( low=-np.inf, high=np.inf, shape=(self.window_size, self.feature_dim), dtype=np.float32 ) # State variables self.reset() def reset(self): """Reset the environment to initial state""" self.balance = self.initial_balance self.position = 0.0 # Amount of asset held self.current_step = self.window_size self.trades = 0 self.wins = 0 self.losses = 0 self.trade_history = [] # Get initial observation observation = self._get_observation() return observation def _get_observation(self): """ Get the current state observation. Combine features from multiple timeframes, reshaped for the CNN. """ # Calculate indices for each timeframe idx_1m = self.current_step idx_5m = idx_1m // 5 idx_15m = idx_1m // 15 # Extract feature windows from each timeframe window_1m = self.features_1m[idx_1m - self.window_size:idx_1m] # Handle 5m timeframe start_5m = max(0, idx_5m - self.window_size) window_5m = self.features_5m[start_5m:idx_5m] # Handle 15m timeframe start_15m = max(0, idx_15m - self.window_size) window_15m = self.features_15m[start_15m:idx_15m] # Pad if needed (for 5m and 15m) if len(window_5m) < self.window_size: padding = np.zeros((self.window_size - len(window_5m), window_5m.shape[1])) window_5m = np.vstack([padding, window_5m]) if len(window_15m) < self.window_size: padding = np.zeros((self.window_size - len(window_15m), window_15m.shape[1])) window_15m = np.vstack([padding, window_15m]) # Combine features from all timeframes combined_features = np.hstack([ window_1m.reshape(self.window_size, -1), window_5m.reshape(self.window_size, -1), window_15m.reshape(self.window_size, -1) ]) # Convert to float32 and handle any NaN values combined_features = np.nan_to_num(combined_features, nan=0.0).astype(np.float32) return combined_features def step(self, action): """ Take an action in the environment and return the next state, reward, done flag, and info Args: action (int): 0 = Buy, 1 = Sell, 2 = Hold Returns: tuple: (observation, reward, done, info) """ # Get current and next price current_price = self.features_1m[self.current_step, -1] # Close price is last column next_price = self.features_1m[self.current_step + 1, -1] # Handle zero or negative prices if current_price <= 0: current_price = 1e-8 # Small positive number if next_price <= 0: next_price = current_price # Use current price if next price is invalid price_change = (next_price - current_price) / current_price # Default reward is slightly negative to discourage inaction reward = -0.0001 done = False # Execute action if action == 0: # BUY if self.position == 0: # Only buy if not already in position self.position = self.balance * (1 - self.trading_fee) self.balance = 0 self.trades += 1 reward = 0 # Neutral reward for entering position self.trade_entry_price = current_price elif action == 1: # SELL if self.position > 0: # Only sell if in position # Calculate position value at current price position_value = self.position * (1 + price_change) self.balance = position_value * (1 - self.trading_fee) # Calculate profit/loss from trade profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price reward = profit_pct * 10 # Scale reward by profit percentage # Update win/loss count if profit_pct > 0: self.wins += 1 else: self.losses += 1 # Record trade self.trade_history.append({ 'entry_price': self.trade_entry_price, 'exit_price': next_price, 'profit_pct': profit_pct }) # Reset position self.position = 0 # else: (action == 2 - HOLD) - no position change # Move to next step self.current_step += 1 # Check if done if self.current_step >= len(self.features_1m) - 1: done = True # Apply final evaluation if self.position > 0: # Force close position at the end position_value = self.position * (1 + price_change) self.balance = position_value * (1 - self.trading_fee) profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price reward += profit_pct * 10 # Update win/loss count if profit_pct > 0: self.wins += 1 else: self.losses += 1 # Get the next observation observation = self._get_observation() # Calculate metrics for info total_value = self.balance + self.position * next_price gain = (total_value - self.initial_balance) / self.initial_balance self.win_rate = self.wins / max(1, self.trades) info = { 'balance': self.balance, 'position': self.position, 'total_value': total_value, 'gain': gain, 'trades': self.trades, 'win_rate': self.win_rate } return observation, reward, done, info def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent"): """ Train DQN agent for RL-based trading with extended training and monitoring """ logger.info("Starting extended RL training for trading...") # Environment setup window_size = 20 timeframes = ["1m", "5m", "15m"] trading_fee = 0.001 # Ensure save directory exists os.makedirs(os.path.dirname(save_path), exist_ok=True) # Setup TensorBoard for monitoring writer = SummaryWriter(f'runs/rl_training_{datetime.now().strftime("%Y%m%d_%H%M%S")}') # Data loading data_interface = DataInterface( symbol="BTC/USDT", timeframes=timeframes ) # Get training data for each timeframe with more data logger.info("Loading training data...") features_1m = data_interface.get_training_data("1m", n_candles=5000) if features_1m is not None: logger.info(f"Loaded {len(features_1m)} 1m candles") else: logger.error("Failed to load 1m data") return None features_5m = data_interface.get_training_data("5m", n_candles=2500) if features_5m is not None: logger.info(f"Loaded {len(features_5m)} 5m candles") else: logger.error("Failed to load 5m data") return None features_15m = data_interface.get_training_data("15m", n_candles=2500) if features_15m is not None: logger.info(f"Loaded {len(features_15m)} 15m candles") else: logger.error("Failed to load 15m data") return None if features_1m is None or features_5m is None or features_15m is None: logger.error("Failed to load training data") return None # Convert DataFrames to numpy arrays, excluding timestamp column features_1m = features_1m.drop('timestamp', axis=1, errors='ignore').values features_5m = features_5m.drop('timestamp', axis=1, errors='ignore').values features_15m = features_15m.drop('timestamp', axis=1, errors='ignore').values # Calculate number of features per timeframe num_features = features_1m.shape[1] # Number of features after dropping timestamp # Create environment env = RLTradingEnvironment( features_1m=features_1m, features_5m=features_5m, features_15m=features_15m, window_size=window_size, trading_fee=trading_fee ) # Create agent with adjusted parameters for longer training state_size = window_size action_size = 3 agent = DQNAgent( state_size=state_size, action_size=action_size, window_size=window_size, num_features=num_features, timeframes=timeframes, learning_rate=0.0005, # Reduced learning rate for stability gamma=0.99, # Increased discount factor epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.999, # Slower epsilon decay memory_size=50000, # Increased memory size batch_size=128 # Increased batch size ) # Variables to track best performance best_reward = float('-inf') best_episode = 0 best_pnl = float('-inf') best_win_rate = 0.0 # Training metrics episode_rewards = [] episode_pnls = [] episode_win_rates = [] episode_trades = [] # Check if previous best model exists and load it best_model_path = f"{save_path}_best" if os.path.exists(f"{best_model_path}_policy.pt"): try: logger.info(f"Loading previous best model from {best_model_path}") agent.load(best_model_path) metadata_path = f"{best_model_path}_metadata.json" if os.path.exists(metadata_path): with open(metadata_path, 'r') as f: metadata = json.load(f) best_reward = metadata.get('best_reward', best_reward) best_episode = metadata.get('best_episode', best_episode) best_pnl = metadata.get('best_pnl', best_pnl) best_win_rate = metadata.get('best_win_rate', best_win_rate) logger.info(f"Loaded previous best metrics - Reward: {best_reward:.4f}, PnL: {best_pnl:.4f}, Win Rate: {best_win_rate:.4f}") except Exception as e: logger.error(f"Error loading previous best model: {e}") # Training loop try: for episode in range(1, num_episodes + 1): state = env.reset() total_reward = 0 done = False steps = 0 while not done and steps < max_steps: action = agent.act(state) next_state, reward, done, info = env.step(action) agent.remember(state, action, reward, next_state, done) # Learn from experience loss = agent.replay() state = next_state total_reward += reward steps += 1 # Calculate episode metrics episode_rewards.append(total_reward) episode_pnls.append(info['gain']) episode_win_rates.append(info['win_rate']) episode_trades.append(info['trades']) # Log to TensorBoard writer.add_scalar('Reward/episode', total_reward, episode) writer.add_scalar('PnL/episode', info['gain'], episode) writer.add_scalar('WinRate/episode', info['win_rate'], episode) writer.add_scalar('Trades/episode', info['trades'], episode) writer.add_scalar('Epsilon/episode', agent.epsilon, episode) # Save the best model based on multiple metrics (only every 50 episodes) is_better = False if episode % 50 == 0: # Only check for saving every 50 episodes if (info['gain'] > best_pnl and info['win_rate'] > 0.5) or \ (info['gain'] > best_pnl * 1.1) or \ (info['win_rate'] > best_win_rate * 1.1): best_reward = total_reward best_episode = episode best_pnl = info['gain'] best_win_rate = info['win_rate'] agent.save(best_model_path) is_better = True # Save metadata about the best model metadata = { 'best_reward': best_reward, 'best_episode': best_episode, 'best_pnl': best_pnl, 'best_win_rate': best_win_rate, 'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S') } with open(f"{best_model_path}_metadata.json", 'w') as f: json.dump(metadata, f) # Log training progress if episode % 10 == 0: avg_reward = sum(episode_rewards[-10:]) / 10 avg_pnl = sum(episode_pnls[-10:]) / 10 avg_win_rate = sum(episode_win_rates[-10:]) / 10 avg_trades = sum(episode_trades[-10:]) / 10 status = "NEW BEST!" if is_better else "" logger.info(f"Episode {episode}/{num_episodes} {status}") logger.info(f"Metrics (last 10 episodes):") logger.info(f" Reward: {avg_reward:.4f}") logger.info(f" PnL: {avg_pnl:.4f}") logger.info(f" Win Rate: {avg_win_rate:.4f}") logger.info(f" Trades: {avg_trades:.2f}") logger.info(f" Epsilon: {agent.epsilon:.4f}") logger.info(f"Best so far - PnL: {best_pnl:.4f}, Win Rate: {best_win_rate:.4f}") except KeyboardInterrupt: logger.info("Training interrupted by user. Saving best model...") # Close TensorBoard writer writer.close() # Final logs logger.info(f"Training completed. Best model from episode {best_episode}") logger.info(f"Best metrics:") logger.info(f" Reward: {best_reward:.4f}") logger.info(f" PnL: {best_pnl:.4f}") logger.info(f" Win Rate: {best_win_rate:.4f}") # Return the agent for potential further use return agent if __name__ == "__main__": train_rl()