import torch import numpy as np from torch.utils.tensorboard import SummaryWriter import logging import time from datetime import datetime import os import sys import pandas as pd import gym import json # Add parent directory to path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from NN.utils.data_interface import DataInterface from NN.utils.trading_env import TradingEnvironment from NN.models.dqn_agent import DQNAgent from NN.utils.signal_interpreter import SignalInterpreter # Configure logging logger = logging.getLogger(__name__) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler('rl_training.log'), logging.StreamHandler() ] ) # Set up device for PyTorch (use GPU if available) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Log GPU status if torch.cuda.is_available(): gpu_count = torch.cuda.device_count() gpu_names = [torch.cuda.get_device_name(i) for i in range(gpu_count)] logger.info(f"Using GPU: {gpu_names}") # Enable TensorFloat32 for NVIDIA Ampere GPUs for faster training if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported(): logger.info("BFloat16 precision is supported - will use for faster training") else: logger.warning("GPU not available. Using CPU for training (slower).") class RLTradingEnvironment(gym.Env): """ Reinforcement Learning environment for trading with technical indicators from multiple timeframes """ def __init__(self, features_1m, features_5m, features_15m, window_size=20, trading_fee=0.0025, min_trade_interval=15): super().__init__() # Initialize attributes before parent class self.window_size = window_size self.num_features = features_1m.shape[1] - 1 # Exclude close price self.num_timeframes = 3 # 1m, 5m, 15m self.feature_dim = self.num_features * self.num_timeframes # Store features from different timeframes self.features_1m = features_1m self.features_5m = features_5m self.features_15m = features_15m # Trading parameters self.initial_balance = 1.0 self.trading_fee = trading_fee # Increased from 0.001 to 0.0025 (0.25%) self.min_trade_interval = min_trade_interval # Minimum steps between trades # Define action and observation spaces self.action_space = gym.spaces.Discrete(3) # 0: Buy, 1: Sell, 2: Hold self.observation_space = gym.spaces.Box( low=-np.inf, high=np.inf, shape=(self.window_size, self.feature_dim), dtype=np.float32 ) # State variables self.reset() # Callback for visualization or external monitoring self.action_callback = None def reset(self): """Reset the environment to initial state""" self.balance = self.initial_balance self.position = 0.0 # Amount of asset held self.current_step = self.window_size self.trades = 0 self.wins = 0 self.losses = 0 self.trade_history = [] self.last_trade_step = -self.min_trade_interval # Initialize to allow immediate first trade # Get initial observation observation = self._get_observation() return observation def _get_observation(self): """ Get the current state observation. Combine features from multiple timeframes, reshaped for the CNN. """ # Calculate indices for each timeframe idx_1m = self.current_step idx_5m = idx_1m // 5 idx_15m = idx_1m // 15 # Extract feature windows from each timeframe window_1m = self.features_1m[idx_1m - self.window_size:idx_1m] # Handle 5m timeframe start_5m = max(0, idx_5m - self.window_size) window_5m = self.features_5m[start_5m:idx_5m] # Handle 15m timeframe start_15m = max(0, idx_15m - self.window_size) window_15m = self.features_15m[start_15m:idx_15m] # Pad if needed (for 5m and 15m) if len(window_5m) < self.window_size: padding = np.zeros((self.window_size - len(window_5m), window_5m.shape[1])) window_5m = np.vstack([padding, window_5m]) if len(window_15m) < self.window_size: padding = np.zeros((self.window_size - len(window_15m), window_15m.shape[1])) window_15m = np.vstack([padding, window_15m]) # Combine features from all timeframes combined_features = np.hstack([ window_1m.reshape(self.window_size, -1), window_5m.reshape(self.window_size, -1), window_15m.reshape(self.window_size, -1) ]) # Convert to float32 and handle any NaN values combined_features = np.nan_to_num(combined_features, nan=0.0).astype(np.float32) return combined_features def step(self, action): """ Take an action in the environment and return the next state, reward, done flag, and info Args: action (int): 0 = Buy, 1 = Sell, 2 = Hold Returns: tuple: (observation, reward, done, info) """ # Get current and next price current_price = self.features_1m[self.current_step, -1] # Close price is last column next_price = self.features_1m[self.current_step + 1, -1] # Handle zero or negative prices if current_price <= 0: current_price = 1e-8 # Small positive number if next_price <= 0: next_price = current_price # Use current price if next price is invalid price_change = (next_price - current_price) / current_price # Default reward is slightly negative to discourage inaction reward = -0.0001 done = False profit_pct = None # Initialize profit_pct variable # Check if enough time has passed since last trade trade_interval = self.current_step - self.last_trade_step trade_interval_penalty = 0 # Execute action if action == 0: # BUY if self.position == 0: # Only buy if not already in position # Apply extra penalty for trading too frequently if trade_interval < self.min_trade_interval: trade_interval_penalty = -0.002 * (self.min_trade_interval - trade_interval) # Still allow the trade but with penalty self.position = self.balance * (1 - self.trading_fee) self.balance = 0 self.trades += 1 reward = -0.001 + trade_interval_penalty # Small cost for transaction + potential penalty self.trade_entry_price = current_price self.last_trade_step = self.current_step elif action == 1: # SELL if self.position > 0: # Only sell if in position # Apply extra penalty for trading too frequently if trade_interval < self.min_trade_interval: trade_interval_penalty = -0.002 * (self.min_trade_interval - trade_interval) # Still allow the trade but with penalty # Calculate position value at current price position_value = self.position * (1 + price_change) self.balance = position_value * (1 - self.trading_fee) # Calculate profit/loss from trade profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price # Scale reward by profit percentage and apply trade interval penalty reward = (profit_pct * 10) + trade_interval_penalty # Update win/loss count if profit_pct > 0: self.wins += 1 else: self.losses += 1 # Record trade self.trade_history.append({ 'entry_price': self.trade_entry_price, 'exit_price': next_price, 'profit_pct': profit_pct, 'trade_interval': trade_interval }) # Reset position and update last trade step self.position = 0 self.last_trade_step = self.current_step # else: (action == 2 - HOLD) - no position change # Move to next step self.current_step += 1 # Check if done if self.current_step >= len(self.features_1m) - 1: done = True # Apply final evaluation if self.position > 0: # Force close position at the end position_value = self.position * (1 + price_change) self.balance = position_value * (1 - self.trading_fee) profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price reward += profit_pct * 10 # Update win/loss count if profit_pct > 0: self.wins += 1 else: self.losses += 1 # Get the next observation observation = self._get_observation() # Calculate metrics for info total_value = self.balance + self.position * next_price gain = (total_value - self.initial_balance) / self.initial_balance self.win_rate = self.wins / max(1, self.trades) info = { 'balance': self.balance, 'position': self.position, 'total_value': total_value, 'gain': gain, 'trades': self.trades, 'win_rate': self.win_rate, 'profit_pct': profit_pct if action == 1 and self.position == 0 else None, 'current_price': current_price, 'next_price': next_price } # Call the callback if it exists if self.action_callback: self.action_callback(action, current_price, reward, info) return observation, reward, done, info def set_action_callback(self, callback): """ Set a callback function to be called after each action Args: callback: Function with signature (action, price, reward, info) """ self.action_callback = callback def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent", action_callback=None, episode_callback=None, symbol="BTC/USDT"): """ Train a reinforcement learning agent for trading Args: env_class: Optional environment class override num_episodes: Number of episodes to train for max_steps: Maximum steps per episode save_path: Path to save the trained model action_callback: Callback function for monitoring actions episode_callback: Callback function for monitoring episodes symbol: Trading symbol to use Returns: tuple: (trained agent, environment) """ # Load data for the selected symbol data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m']) try: # Try to load data for the requested symbol using get_historical_data method data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000) data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000) data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000) if data_1m is None or data_5m is None or data_15m is None: raise FileNotFoundError("Could not retrieve data for specified symbol") except Exception as e: logger.warning(f"Data for {symbol} not available: {str(e)}. Using default data.") # Try to use cached data if available symbol = "BTC/USDT" data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m']) data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000) data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000) data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000) if data_1m is None or data_5m is None or data_15m is None: logger.error("Failed to retrieve any data. Cannot continue training.") raise ValueError("No data available for training") # Create features from the data by adding technical indicators and converting to numpy format if data_1m is not None: data_1m = data_interface.add_technical_indicators(data_1m) # Convert to numpy array with close price as the last column features_1m = np.hstack([ data_1m.drop(['timestamp', 'close'], axis=1).values, data_1m['close'].values.reshape(-1, 1) ]) else: features_1m = None if data_5m is not None: data_5m = data_interface.add_technical_indicators(data_5m) # Convert to numpy array with close price as the last column features_5m = np.hstack([ data_5m.drop(['timestamp', 'close'], axis=1).values, data_5m['close'].values.reshape(-1, 1) ]) else: features_5m = None if data_15m is not None: data_15m = data_interface.add_technical_indicators(data_15m) # Convert to numpy array with close price as the last column features_15m = np.hstack([ data_15m.drop(['timestamp', 'close'], axis=1).values, data_15m['close'].values.reshape(-1, 1) ]) else: features_15m = None # Check if we have all the required features if features_1m is None or features_5m is None or features_15m is None: logger.error("Failed to create features for all timeframes.") raise ValueError("Could not create features for training") # Create the environment if env_class: # Use provided environment class env = env_class(features_1m, features_5m, features_15m) else: # Use the default environment env = RLTradingEnvironment(features_1m, features_5m, features_15m) # Set action callback if provided if action_callback: env.set_action_callback(action_callback) # Get environment properties for agent creation input_shape = env.observation_space.shape n_actions = env.action_space.n # Create the agent agent = DQNAgent( state_shape=input_shape, n_actions=n_actions, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, learning_rate=0.0001, gamma=0.99, buffer_size=10000, batch_size=64, device=device # Pass device to agent for GPU usage ) # Check if model file exists and load it model_file = f"{save_path}_model.pth" if os.path.exists(model_file): try: agent.load(model_file) logger.info(f"Loaded existing model from {model_file}") except Exception as e: logger.error(f"Error loading model: {e}") else: logger.info("No existing model found. Starting with a new model.") # Create TensorBoard writer writer = SummaryWriter(log_dir=f'runs/dqn_{int(time.time())}') # Log GPU status to TensorBoard writer.add_text("hardware/device", str(device), 0) if torch.cuda.is_available(): for i in range(torch.cuda.device_count()): writer.add_text(f"hardware/gpu_{i}", torch.cuda.get_device_name(i), 0) # Training loop total_rewards = [] trade_win_rates = [] best_reward = -np.inf # Move models to the appropriate device if not already there agent.move_models_to_device(device) # Enable mixed precision if GPU and feature is available use_mixed_precision = False if torch.cuda.is_available() and hasattr(torch.cuda, 'amp'): logger.info("Enabling mixed precision training") use_mixed_precision = True scaler = torch.cuda.amp.GradScaler() # Define step callback for tensorboard logging and model tracking def step_callback(action, price, reward, info): # Pass to external callback if provided if action_callback: action_callback(env.current_step, action, price, reward, info) # Main training loop logger.info(f"Starting training for {num_episodes} episodes...") logger.info(f"Starting training on device: {agent.device}") try: for episode in range(num_episodes): state = env.reset() total_reward = 0 for step in range(max_steps): # Select action action = agent.act(state) # Take action and observe next state and reward next_state, reward, done, info = env.step(action) # Store the experience in memory agent.remember(state, action, reward, next_state, done) # Update state and reward state = next_state total_reward += reward # Train the agent by sampling from memory if len(agent.memory) >= agent.batch_size: loss = agent.replay() if done or step == max_steps - 1: break # Track rewards total_rewards.append(total_reward) # Calculate trading metrics win_rate = env.win_rate if hasattr(env, 'win_rate') else 0 trades = env.trades if hasattr(env, 'trades') else 0 # Log to TensorBoard writer.add_scalar('Reward/Episode', total_reward, episode) writer.add_scalar('Trade/WinRate', win_rate, episode) writer.add_scalar('Trade/Count', trades, episode) # Save best model if total_reward > best_reward and episode > 10: logger.info(f"New best average reward: {total_reward:.4f}, saving model") agent.save(save_path) best_reward = total_reward # Periodic save every 100 episodes if episode % 100 == 0 and episode > 0: agent.save(f"{save_path}_episode_{episode}") # Call episode callback if provided if episode_callback: # Add environment to info dict to use for extrema training info_with_env = info.copy() info_with_env['env'] = env episode_callback(episode, total_reward, info_with_env) # Final save logger.info("Training completed, saving final model") agent.save(f"{save_path}_final") except Exception as e: logger.error(f"Training failed: {str(e)}") import traceback logger.error(traceback.format_exc()) # Close TensorBoard writer writer.close() return agent, env if __name__ == "__main__": train_rl()