diff --git a/NN/models/simple_cnn.py b/NN/models/simple_cnn.py index 6ff6d1b..f5e491b 100644 --- a/NN/models/simple_cnn.py +++ b/NN/models/simple_cnn.py @@ -13,10 +13,9 @@ logger = logging.getLogger(__name__) class CNNModelPyTorch(nn.Module): """ - CNN model for trading signals - Simplified version for RL training + CNN model for trading with multiple timeframes """ - def __init__(self, window_size: int, num_features: int, output_size: int, timeframes: List[str]): + def __init__(self, window_size, num_features, output_size, timeframes): super(CNNModelPyTorch, self).__init__() self.window_size = window_size @@ -24,12 +23,33 @@ class CNNModelPyTorch(nn.Module): self.output_size = output_size self.timeframes = timeframes + # Calculate total input features across all timeframes + self.total_features = num_features * len(timeframes) + # Device configuration self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') logger.info(f"Using device: {self.device}") - # Build model - self.build_model() + # Convolutional layers + self.conv1 = nn.Conv1d(self.total_features, 64, kernel_size=3, padding=1) + self.bn1 = nn.BatchNorm1d(64) + + self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm1d(128) + + self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1) + self.bn3 = nn.BatchNorm1d(256) + + # Calculate size after convolutions + conv_output_size = window_size * 256 + + # Fully connected layers + self.fc1 = nn.Linear(conv_output_size, 512) + self.fc2 = nn.Linear(512, 256) + + # Advantage and Value streams (Dueling DQN architecture) + self.fc3 = nn.Linear(256, output_size) # Advantage stream + self.value_fc = nn.Linear(256, 1) # Value stream # Initialize optimizer and scheduler self.optimizer = optim.Adam(self.parameters(), lr=0.001) @@ -40,36 +60,6 @@ class CNNModelPyTorch(nn.Module): # Move model to device self.to(self.device) - def build_model(self): - """Build the CNN architecture""" - # First Convolutional Layer - self.conv1 = nn.Conv1d( - in_channels=self.num_features * len(self.timeframes), - out_channels=32, - kernel_size=3, - padding=1 - ) - self.bn1 = nn.BatchNorm1d(32) - - # Second Convolutional Layer - self.conv2 = nn.Conv1d(32, 64, kernel_size=3, padding=1) - self.bn2 = nn.BatchNorm1d(64) - - # Third Convolutional Layer - self.conv3 = nn.Conv1d(64, 128, kernel_size=3, padding=1) - self.bn3 = nn.BatchNorm1d(128) - - # Calculate size after convolutions - conv_out_size = self.window_size * 128 - - # Fully connected layers - self.fc1 = nn.Linear(conv_out_size, 512) - self.fc2 = nn.Linear(512, 256) - self.fc3 = nn.Linear(256, self.output_size) - - # Additional output for value estimation - self.value_fc = nn.Linear(256, 1) - def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """Forward pass through the network""" # Ensure input is on the correct device diff --git a/NN/train_rl.py b/NN/train_rl.py index a729abc..c052e3e 100644 --- a/NN/train_rl.py +++ b/NN/train_rl.py @@ -8,6 +8,7 @@ import os import sys import pandas as pd import gym +import json # Add parent directory to path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -28,165 +29,386 @@ logging.basicConfig( ] ) -class RLTradingEnvironment(TradingEnvironment): - """Extended trading environment that reshapes state for CNN""" - def __init__(self, data, window_size, num_features, num_timeframes, **kwargs): - # Set attributes before parent initialization +class RLTradingEnvironment(gym.Env): + """ + Reinforcement Learning environment for 
trading with technical indicators
+    from multiple timeframes
+    """
+    def __init__(self, features_1m, features_5m, features_15m, window_size=20, trading_fee=0.001):
+        super().__init__()
+
+        # Environment dimensions (the close price is kept as the last column
+        # of each feature array for fills, but is excluded from observations)
         self.window_size = window_size
-        self.num_features = num_features
-        self.num_timeframes = num_timeframes
-        self.feature_dim = num_features * num_timeframes
+        self.num_features = features_1m.shape[1] - 1  # Exclude close price
+        self.num_timeframes = 3  # 1m, 5m, 15m
+        self.feature_dim = self.num_features * self.num_timeframes
 
-        # Initialize parent class
-        super().__init__(data=data, **kwargs)
+        # Store features from different timeframes
+        self.features_1m = features_1m
+        self.features_5m = features_5m
+        self.features_15m = features_15m
 
-        # Update observation space for CNN
+        # Trading parameters
+        self.initial_balance = 1.0
+        self.trading_fee = trading_fee
+
+        # Define action and observation spaces
+        self.action_space = gym.spaces.Discrete(3)  # 0: Buy, 1: Sell, 2: Hold
         self.observation_space = gym.spaces.Box(
             low=-np.inf,
             high=np.inf,
             shape=(self.window_size, self.feature_dim),
             dtype=np.float32
         )
 
+        # State variables
+        self.reset()
+
+    def reset(self):
+        """Reset the environment to its initial state"""
+        self.balance = self.initial_balance
+        self.position = 0.0  # Quote-currency value committed to the open position (0 when flat)
+        self.trade_entry_price = 0.0  # Close price at which the open position was entered
+        self.current_step = self.window_size
+        self.trades = 0
+        self.wins = 0
+        self.losses = 0
+        self.trade_history = []
+
+        # Get initial observation
+        observation = self._get_observation()
+        return observation
+
     def _get_observation(self):
-        """Get current observation reshaped for CNN"""
-        # Get flattened observation from parent class
-        flat_obs = super()._get_observation()
+        """
+        Get the current state observation.
+        Combine features from multiple timeframes, reshaped for the CNN.
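+        Returns an array of shape (window_size, num_features * num_timeframes);
+        the 5m and 15m windows are aligned to the 1m index by integer division
+        and zero-padded while there is not yet enough history.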
+        """
+        # Calculate indices for each timeframe
+        idx_1m = self.current_step
+        idx_5m = idx_1m // 5
+        idx_15m = idx_1m // 15
 
-        # Extract features (exclude close price)
-        features = flat_obs[:-1]  # Remove close price
+        # Extract feature windows from each timeframe, dropping the trailing
+        # close-price column so the width matches observation_space
+        window_1m = self.features_1m[idx_1m - self.window_size:idx_1m, :self.num_features]
 
-        # Calculate number of complete windows
-        n_windows = len(features) // self.feature_dim
-        if n_windows < self.window_size:
-            # Pad with zeros if not enough data
-            padding = np.zeros((self.window_size - n_windows, self.feature_dim))
-            reshaped = np.vstack([
-                padding,
-                features[-(n_windows * self.feature_dim):].reshape(n_windows, self.feature_dim)
-            ])
-        else:
-            # Take the most recent window_size windows
-            start_idx = (n_windows - self.window_size) * self.feature_dim
-            reshaped = features[start_idx:].reshape(self.window_size, self.feature_dim)
+        # Handle 5m timeframe
+        start_5m = max(0, idx_5m - self.window_size)
+        window_5m = self.features_5m[start_5m:idx_5m, :self.num_features]
 
-        return reshaped.astype(np.float32)
+        # Handle 15m timeframe
+        start_15m = max(0, idx_15m - self.window_size)
+        window_15m = self.features_15m[start_15m:idx_15m, :self.num_features]
+
+        # Zero-pad the 5m and 15m windows while their history is still short
+        if len(window_5m) < self.window_size:
+            padding = np.zeros((self.window_size - len(window_5m), window_5m.shape[1]))
+            window_5m = np.vstack([padding, window_5m])
+
+        if len(window_15m) < self.window_size:
+            padding = np.zeros((self.window_size - len(window_15m), window_15m.shape[1]))
+            window_15m = np.vstack([padding, window_15m])
+
+        # Combine features from all timeframes
+        combined_features = np.hstack([
+            window_1m.reshape(self.window_size, -1),
+            window_5m.reshape(self.window_size, -1),
+            window_15m.reshape(self.window_size, -1)
+        ])
+
+        # Convert to float32 and handle any NaN values
+        combined_features = np.nan_to_num(combined_features, nan=0.0).astype(np.float32)
+
+        return combined_features
+
+    def step(self, action):
+        """
+        Take an action in the environment and return the next state, reward,
+        done flag, and info.
+
+        Args:
+            action (int): 0 = Buy, 1 = Sell, 2 = Hold
+
+        Returns:
+            tuple: (observation, reward, done, info)
+        """
+        # Get current and next price
+        current_price = self.features_1m[self.current_step, -1]  # Close price is last column
+        next_price = self.features_1m[self.current_step + 1, -1]
+
+        # Handle zero or negative prices
+        if current_price <= 0:
+            current_price = 1e-8  # Small positive number
+        if next_price <= 0:
+            next_price = current_price  # Use current price if next price is invalid
+
+        price_change = (next_price - current_price) / current_price
+
+        # Default reward is slightly negative to discourage inaction
+        reward = -0.0001
+        done = False
+
+        # Execute action
+        if action == 0:  # BUY
+            if self.position == 0:  # Only buy if not already in position
+                self.position = self.balance * (1 - self.trading_fee)
+                self.balance = 0
+                self.trades += 1
+                reward = 0  # Neutral reward for entering position
+                self.trade_entry_price = current_price
+
+        elif action == 1:  # SELL
+            if self.position > 0:  # Only sell if in position
+                # Value the position from its entry price to the exit fill,
+                # not just over the last step
+                position_value = self.position * (next_price / self.trade_entry_price)
+                self.balance = position_value * (1 - self.trading_fee)
+
+                # Calculate profit/loss from trade
+                profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price
+                reward = profit_pct * 10  # Scale reward by profit percentage
+
+                # Update win/loss count
+                if profit_pct > 0:
+                    self.wins += 1
+                else:
+                    self.losses += 1
+
+                # Record trade
+                self.trade_history.append({
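+                    # Realized-trade record, kept for post-episode analysis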
+                    'entry_price': self.trade_entry_price,
+                    'exit_price': next_price,
+                    'profit_pct': profit_pct
+                })
+
+                # Reset position
+                self.position = 0
+
+        # else: action == 2 (HOLD) - no position change
+
+        # Move to next step
+        self.current_step += 1
+
+        # Check if done
+        if self.current_step >= len(self.features_1m) - 1:
+            done = True
+
+            # Final evaluation: force-close any open position at the end of the data
+            if self.position > 0:
+                position_value = self.position * (next_price / self.trade_entry_price)
+                self.balance = position_value * (1 - self.trading_fee)
+                profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price
+                reward += profit_pct * 10
+
+                # Update win/loss count
+                if profit_pct > 0:
+                    self.wins += 1
+                else:
+                    self.losses += 1
+
+                # Clear the position so it is not counted again below
+                self.position = 0.0
+
+        # Get the next observation
+        observation = self._get_observation()
+
+        # Calculate metrics for info; an open position is marked to the
+        # current price relative to its entry
+        if self.position > 0:
+            total_value = self.balance + self.position * (next_price / self.trade_entry_price)
+        else:
+            total_value = self.balance
+        gain = (total_value - self.initial_balance) / self.initial_balance
+        self.win_rate = self.wins / max(1, self.trades)
+
+        info = {
+            'balance': self.balance,
+            'position': self.position,
+            'total_value': total_value,
+            'gain': gain,
+            'trades': self.trades,
+            'win_rate': self.win_rate
+        }
+
+        return observation, reward, done, info
 
-def train_rl():
+def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent"):
     """
-    Train the RL model using the DQN agent
+    Train a DQN agent for RL-based trading with extended training and monitoring
     """
-    # Initialize data interface with BTC/USDT and multiple timeframes
-    timeframes = ['1m', '5m', '15m']
+    logger.info("Starting extended RL training for trading...")
+
+    # Environment setup
     window_size = 20
-    data_interface = DataInterface(symbol="BTC/USDT", timeframes=timeframes)
+    timeframes = ["1m", "5m", "15m"]
+    trading_fee = 0.001
 
-    # Get training data
-    X_train, y_train, X_val, y_val, train_prices, val_prices = data_interface.prepare_training_data()
-    if X_train is None:
-        logger.error("Failed to get training data")
-        return
+    # Ensure save directory exists
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
 
-    # Calculate feature dimensions
-    num_features = X_train.shape[2]  # Number of features per timeframe
-    total_features = num_features * len(timeframes)  # Total features across all timeframes
+    # Setup TensorBoard for monitoring
+    writer = SummaryWriter(f'runs/rl_training_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
 
-    # Flatten features for environment
-    n_samples = X_train.shape[0]
-    flattened_features = X_train.reshape(n_samples, window_size, -1)  # Reshape to (batch, window, features)
+    # Data loading
+    data_interface = DataInterface(
+        symbol="BTC/USDT",
+        timeframes=timeframes
+    )
 
-    # Create DataFrame with features as separate columns
-    feature_columns = [f'feature_{i}' for i in range(flattened_features.shape[2])]
-    df = pd.DataFrame(flattened_features.reshape(n_samples, -1), columns=feature_columns * window_size)
-    df['close'] = train_prices
+    # Load training data for each timeframe
+    features_1m = data_interface.get_training_data("1m", n_candles=5000)
+    features_5m = data_interface.get_training_data("5m", n_candles=2500)
+    features_15m = data_interface.get_training_data("15m", n_candles=2500)
+
+    if features_1m is None or features_5m is None or features_15m is None:
+        logger.error("Failed to load training data")
+        return None
+
+    # Convert DataFrames to numpy arrays, excluding the timestamp column
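+    # (each array keeps the close price as its final column; the environment
+    # reads execution prices from features_1m[:, -1])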
+    features_1m = features_1m.drop('timestamp', axis=1, errors='ignore').values
+    features_5m = features_5m.drop('timestamp', axis=1, errors='ignore').values
+    features_15m = features_15m.drop('timestamp', axis=1, errors='ignore').values
+
+    # Number of features per timeframe, excluding the trailing close-price
+    # column so it matches RLTradingEnvironment.num_features
+    num_features = features_1m.shape[1] - 1
 
     # Create environment
     env = RLTradingEnvironment(
-        data=df,
+        features_1m=features_1m,
+        features_5m=features_5m,
+        features_15m=features_15m,
         window_size=window_size,
-        num_features=num_features,
-        num_timeframes=len(timeframes),
-        initial_balance=10000,
-        fee_rate=0.001,
-        max_steps=1000
+        trading_fee=trading_fee
     )
 
-    # Create DQN agent
+    # Create agent with adjusted parameters for longer training
+    state_size = window_size
+    action_size = 3
     agent = DQNAgent(
-        state_size=window_size,  # First dimension of observation space
-        action_size=env.action_space.n,
+        state_size=state_size,
+        action_size=action_size,
         window_size=window_size,
         num_features=num_features,
         timeframes=timeframes,
-        learning_rate=0.001,
-        gamma=0.99,
+        learning_rate=0.0005,  # Reduced learning rate for stability
+        gamma=0.99,  # Discount factor
         epsilon=1.0,
         epsilon_min=0.01,
-        epsilon_decay=0.995,
-        memory_size=10000,
-        batch_size=32,
-        target_update=10
+        epsilon_decay=0.999,  # Slower epsilon decay
+        memory_size=50000,  # Increased memory size
+        batch_size=128  # Increased batch size
     )
 
-    # Training parameters
-    episodes = 1000
-    max_steps = 1000
+    # Variables to track best performance
     best_reward = float('-inf')
-    best_model_path = 'NN/models/saved/best_rl_model.pth'
+    best_episode = 0
+    best_pnl = float('-inf')
+    best_win_rate = 0.0
 
-    # Create models directory if it doesn't exist
-    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
+    # Training metrics
+    episode_rewards = []
+    episode_pnls = []
+    episode_win_rates = []
+    episode_trades = []
+
+    # Resume from a previous best model if one exists
+    best_model_path = f"{save_path}_best"
+    if os.path.exists(f"{best_model_path}_policy.pt"):
+        try:
+            logger.info(f"Loading previous best model from {best_model_path}")
+            agent.load(best_model_path)
+            metadata_path = f"{best_model_path}_metadata.json"
+            if os.path.exists(metadata_path):
+                with open(metadata_path, 'r') as f:
+                    metadata = json.load(f)
+                best_reward = metadata.get('best_reward', best_reward)
+                best_episode = metadata.get('best_episode', best_episode)
+                best_pnl = metadata.get('best_pnl', best_pnl)
+                best_win_rate = metadata.get('best_win_rate', best_win_rate)
+                logger.info(f"Loaded previous best metrics - Reward: {best_reward:.4f}, PnL: {best_pnl:.4f}, Win Rate: {best_win_rate:.4f}")
+        except Exception as e:
+            logger.error(f"Error loading previous best model: {e}")
 
     # Training loop
-    for episode in range(episodes):
-        state = env.reset()
-        total_reward = 0
-
-        for step in range(max_steps):
-            # Get action from agent
-            action = agent.act(state)
+    try:
+        for episode in range(1, num_episodes + 1):
+            state = env.reset()
+            total_reward = 0
+            done = False
+            steps = 0
 
-            # Take action in environment
-            next_state, reward, done, info = env.step(action)
-
-            # Store experience in agent's memory
-            agent.remember(state, action, reward, next_state, done)
-
-            # Train agent
-            if len(agent.memory) > agent.batch_size:
+            while not done and steps < max_steps:
+                action = agent.act(state)
+                next_state, reward, done, info = env.step(action)
+                agent.remember(state, action, reward, next_state, done)
+
+                # Learn from experience
                 loss = agent.replay()
-                if loss is not None:
-                    logger.debug(f"Training loss: {loss:.4f}")
+
+                state = next_state
+                total_reward += 
reward + steps += 1 - # Update state and reward - state = next_state - total_reward += reward + # Calculate episode metrics + episode_rewards.append(total_reward) + episode_pnls.append(info['gain']) + episode_win_rates.append(info['win_rate']) + episode_trades.append(info['trades']) - if done: - break - - # Update epsilon - agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay) - - # Log episode results - logger.info(f"Episode: {episode + 1}/{episodes}") - logger.info(f"Total Reward: {total_reward:.2f}") - logger.info(f"Final Balance: {info['balance']:.2f}") - logger.info(f"Max Drawdown: {info['max_drawdown']:.2%}") - logger.info(f"Win Rate: {info['win_rate']:.2%}") - logger.info(f"Epsilon: {agent.epsilon:.4f}") - - # Save best model - if total_reward > best_reward: - best_reward = total_reward - agent.save(best_model_path) - logger.info(f"New best model saved with reward: {best_reward:.2f}") - - # Save checkpoint every 100 episodes - if (episode + 1) % 100 == 0: - checkpoint_path = f'NN/models/saved/rl_model_episode_{episode + 1}.pth' - agent.save(checkpoint_path) - logger.info(f"Checkpoint saved at episode {episode + 1}") + # Log to TensorBoard + writer.add_scalar('Reward/episode', total_reward, episode) + writer.add_scalar('PnL/episode', info['gain'], episode) + writer.add_scalar('WinRate/episode', info['win_rate'], episode) + writer.add_scalar('Trades/episode', info['trades'], episode) + writer.add_scalar('Epsilon/episode', agent.epsilon, episode) + + # Save the best model based on multiple metrics (only every 50 episodes) + is_better = False + if episode % 50 == 0: # Only check for saving every 50 episodes + if (info['gain'] > best_pnl and info['win_rate'] > 0.5) or \ + (info['gain'] > best_pnl * 1.1) or \ + (info['win_rate'] > best_win_rate * 1.1): + best_reward = total_reward + best_episode = episode + best_pnl = info['gain'] + best_win_rate = info['win_rate'] + agent.save(best_model_path) + is_better = True + + # Save metadata about the best model + metadata = { + 'best_reward': best_reward, + 'best_episode': best_episode, + 'best_pnl': best_pnl, + 'best_win_rate': best_win_rate, + 'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + with open(f"{best_model_path}_metadata.json", 'w') as f: + json.dump(metadata, f) + + # Log training progress + if episode % 10 == 0: + avg_reward = sum(episode_rewards[-10:]) / 10 + avg_pnl = sum(episode_pnls[-10:]) / 10 + avg_win_rate = sum(episode_win_rates[-10:]) / 10 + avg_trades = sum(episode_trades[-10:]) / 10 + + status = "NEW BEST!" if is_better else "" + logger.info(f"Episode {episode}/{num_episodes} {status}") + logger.info(f"Metrics (last 10 episodes):") + logger.info(f" Reward: {avg_reward:.4f}") + logger.info(f" PnL: {avg_pnl:.4f}") + logger.info(f" Win Rate: {avg_win_rate:.4f}") + logger.info(f" Trades: {avg_trades:.2f}") + logger.info(f" Epsilon: {agent.epsilon:.4f}") + logger.info(f"Best so far - PnL: {best_pnl:.4f}, Win Rate: {best_win_rate:.4f}") + + except KeyboardInterrupt: + logger.info("Training interrupted by user. Saving best model...") + + # Close TensorBoard writer + writer.close() + + # Final logs + logger.info(f"Training completed. 
Best model from episode {best_episode}") + logger.info(f"Best metrics:") + logger.info(f" Reward: {best_reward:.4f}") + logger.info(f" PnL: {best_pnl:.4f}") + logger.info(f" Win Rate: {best_win_rate:.4f}") + + # Return the agent for potential further use + return agent if __name__ == "__main__": train_rl() \ No newline at end of file diff --git a/NN/utils/data_interface.py b/NN/utils/data_interface.py index dd595d0..c3a1170 100644 --- a/NN/utils/data_interface.py +++ b/NN/utils/data_interface.py @@ -303,6 +303,76 @@ class DataInterface: """ return len(self.timeframes) * 5 # OHLCV features for each timeframe + def get_features(self, timeframe, n_candles=1000): + """ + Get feature data with technical indicators for a specific timeframe. + + Args: + timeframe (str): Timeframe to get features for ('1m', '5m', etc.) + n_candles (int): Number of candles to get + + Returns: + np.ndarray: Array of feature data including technical indicators + and the close price as the last column + """ + # Get historical data + df = self.get_historical_data(timeframe=timeframe, n_candles=n_candles) + + if df is None or df.empty: + logger.error(f"No data available for {self.symbol} {timeframe}") + return None + + # Add technical indicators + df = self.add_technical_indicators(df) + + # Drop NaN values that might have been introduced by indicators + df = df.dropna() + + # Extract features (all columns except timestamp) + features = df.drop('timestamp', axis=1).values + + logger.info(f"Prepared {len(features)} {timeframe} feature rows with {features.shape[1]} features") + return features + + def add_technical_indicators(self, df): + """ + Add technical indicators to the dataframe. + + Args: + df (pd.DataFrame): DataFrame with OHLCV data + + Returns: + pd.DataFrame: DataFrame with added technical indicators + """ + # Make a copy to avoid modifying the original + df_copy = df.copy() + + # Basic price indicators + df_copy['returns'] = df_copy['close'].pct_change() + df_copy['log_returns'] = np.log(df_copy['close']/df_copy['close'].shift(1)) + + # Moving Averages + df_copy['sma_7'] = ta.trend.sma_indicator(df_copy['close'], window=7) + df_copy['sma_25'] = ta.trend.sma_indicator(df_copy['close'], window=25) + df_copy['sma_99'] = ta.trend.sma_indicator(df_copy['close'], window=99) + + # MACD + macd = ta.trend.MACD(df_copy['close']) + df_copy['macd'] = macd.macd() + df_copy['macd_signal'] = macd.macd_signal() + df_copy['macd_diff'] = macd.macd_diff() + + # RSI + df_copy['rsi'] = ta.momentum.rsi(df_copy['close'], window=14) + + # Bollinger Bands + bollinger = ta.volatility.BollingerBands(df_copy['close']) + df_copy['bb_high'] = bollinger.bollinger_hband() + df_copy['bb_low'] = bollinger.bollinger_lband() + df_copy['bb_pct'] = bollinger.bollinger_pband() + + return df_copy + def calculate_pnl(self, predictions, actual_prices, position_size=1.0): """ Robust PnL calculator that handles: @@ -557,33 +627,33 @@ class DataInterface: # Calculate technical indicators try: # Add RSI (14) - df['rsi'] = ta.rsi(df['close'], length=14) + df['rsi'] = ta.momentum.rsi(df['close'], window=14) # Add MACD - macd = ta.macd(df['close']) - df['macd'] = macd['MACD_12_26_9'] - df['macd_signal'] = macd['MACDs_12_26_9'] - df['macd_hist'] = macd['MACDh_12_26_9'] + macd = ta.trend.MACD(df['close']) + df['macd'] = macd.macd() + df['macd_signal'] = macd.macd_signal() + df['macd_hist'] = macd.macd_diff() # Add Bollinger Bands - bbands = ta.bbands(df['close'], length=20) - df['bb_upper'] = bbands['BBU_20_2.0'] - df['bb_middle'] = bbands['BBM_20_2.0'] - 
-            df['bb_lower'] = bbands['BBL_20_2.0']
+            bbands = ta.volatility.BollingerBands(df['close'])
+            df['bb_upper'] = bbands.bollinger_hband()
+            df['bb_middle'] = bbands.bollinger_mavg()
+            df['bb_lower'] = bbands.bollinger_lband()
 
             # Add ATR (Average True Range)
-            df['atr'] = ta.atr(df['high'], df['low'], df['close'], length=14)
+            df['atr'] = ta.volatility.average_true_range(df['high'], df['low'], df['close'], window=14)
 
             # Add moving averages
-            df['sma_20'] = ta.sma(df['close'], length=20)
-            df['sma_50'] = ta.sma(df['close'], length=50)
-            df['ema_20'] = ta.ema(df['close'], length=20)
+            df['sma_20'] = ta.trend.sma_indicator(df['close'], window=20)
+            df['sma_50'] = ta.trend.sma_indicator(df['close'], window=50)
+            df['ema_20'] = ta.trend.ema_indicator(df['close'], window=20)
 
             # Add OBV (On-Balance Volume)
-            df['obv'] = ta.obv(df['close'], df['volume'])
+            df['obv'] = ta.volume.on_balance_volume(df['close'], df['volume'])
 
             # Add momentum indicators
-            df['mom'] = ta.mom(df['close'], length=10)
+            # The ta library has no plain momentum (MOM) indicator, so rate of
+            # change is used here as the closest equivalent
+            df['mom'] = ta.momentum.roc(df['close'], window=10)
 
             # Normalize price to previous close
             df['close_norm'] = df['close'] / df['close'].shift(1) - 1
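
A quick way to exercise the reworked environment end to end is a smoke test on synthetic data. The sketch below is illustrative only: the module path NN.train_rl, the column count, and the random-walk prices are assumptions made for the example; the only real contract it relies on is the trailing close-price column that DataInterface.get_features documents.

# Smoke test for RLTradingEnvironment with synthetic feature arrays
# (a sketch under assumed shapes, not part of the patch itself).
import numpy as np
from NN.train_rl import RLTradingEnvironment

rng = np.random.default_rng(0)

def fake_features(n_rows, n_cols=8):
    """Fabricate indicator columns plus a trailing random-walk close price."""
    indicators = rng.normal(size=(n_rows, n_cols - 1))
    close = 100.0 + np.cumsum(rng.normal(scale=0.5, size=n_rows))
    return np.column_stack([indicators, close]).astype(np.float32)

# 600 one-minute rows align with 120 five-minute and 40 fifteen-minute rows
env = RLTradingEnvironment(
    features_1m=fake_features(600),
    features_5m=fake_features(120),
    features_15m=fake_features(40),
    window_size=20,
)

obs = env.reset()
# With 8 columns per timeframe, observations are (20, 7 * 3) after the
# close column is dropped
assert obs.shape == env.observation_space.shape

done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
print(info['total_value'], info['trades'], info['win_rate'])

Running a full random episode this way confirms that the declared observation_space matches what _get_observation actually emits, and that the bookkeeping in step() terminates cleanly, before any GPU training time is spent.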