import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import logging
import time
from datetime import datetime
import os
import sys
import pandas as pd
import gym

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from NN.utils.data_interface import DataInterface
from NN.utils.trading_env import TradingEnvironment
from NN.models.dqn_agent import DQNAgent
from NN.utils.signal_interpreter import SignalInterpreter

# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rl_training.log'),
        logging.StreamHandler()
    ]
)


class RLTradingEnvironment(TradingEnvironment):
    """Extended trading environment that reshapes the flat state into a
    (window_size, features) matrix for a CNN."""

    def __init__(self, data, window_size, num_features, num_timeframes, **kwargs):
        # Set attributes before parent initialization
        self.window_size = window_size
        self.num_features = num_features
        self.num_timeframes = num_timeframes
        self.feature_dim = num_features * num_timeframes

        # Initialize parent class
        super().__init__(data=data, **kwargs)

        # Update observation space for the CNN-shaped state
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size, self.feature_dim),
            dtype=np.float32
        )

    def _get_observation(self):
        """Get the current observation reshaped for the CNN."""
        # Get flattened observation from parent class
        flat_obs = super()._get_observation()

        # Extract features (drop the trailing close price)
        features = flat_obs[:-1]

        # Calculate number of complete feature windows available
        n_windows = len(features) // self.feature_dim

        if n_windows < self.window_size:
            # Pad with zeros at the front if there is not enough history
            padding = np.zeros((self.window_size - n_windows, self.feature_dim))
            reshaped = np.vstack([
                padding,
                features[-(n_windows * self.feature_dim):].reshape(n_windows, self.feature_dim)
            ])
        else:
            # Take the most recent window_size windows
            start_idx = (n_windows - self.window_size) * self.feature_dim
            reshaped = features[start_idx:].reshape(self.window_size, self.feature_dim)

        return reshaped.astype(np.float32)


def train_rl():
    """Train the RL model using the DQN agent."""
    # Initialize data interface with BTC/USDT and multiple timeframes
    timeframes = ['1m', '5m', '15m']
    window_size = 20
    data_interface = DataInterface(symbol="BTC/USDT", timeframes=timeframes)

    # Get training data
    X_train, y_train, X_val, y_val, train_prices, val_prices = data_interface.prepare_training_data()
    if X_train is None:
        logger.error("Failed to get training data")
        return

    # Calculate feature dimensions
    num_features = X_train.shape[2]                  # Features per timeframe
    total_features = num_features * len(timeframes)  # Features across all timeframes

    # Reshape features to (batch, window, features) for the environment
    n_samples = X_train.shape[0]
    flattened_features = X_train.reshape(n_samples, window_size, -1)

    # Create a DataFrame with one uniquely named column per flattened feature
    feature_columns = [f'feature_{i}' for i in range(window_size * flattened_features.shape[2])]
    df = pd.DataFrame(flattened_features.reshape(n_samples, -1), columns=feature_columns)
    df['close'] = train_prices

    # Create environment
    env = RLTradingEnvironment(
        data=df,
        window_size=window_size,
        num_features=num_features,
        num_timeframes=len(timeframes),
        initial_balance=10000,
        fee_rate=0.001,
        max_steps=1000
    )

    # Create DQN agent
    agent = DQNAgent(
        state_size=window_size,  # First dimension of the observation space
        action_size=env.action_space.n,
        window_size=window_size,
        num_features=num_features,
        timeframes=timeframes,
        learning_rate=0.001,
        gamma=0.99,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.995,
        memory_size=10000,
        batch_size=32,
        target_update=10
    )

    # Training parameters
    episodes = 1000
    max_steps = 1000
    best_reward = float('-inf')
    best_model_path = 'NN/models/saved/best_rl_model.pth'

    # Create models directory if it doesn't exist
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

    # Training loop
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0

        for step in range(max_steps):
            # Get action from agent
            action = agent.act(state)

            # Take action in environment
            next_state, reward, done, info = env.step(action)

            # Store experience in agent's memory
            agent.remember(state, action, reward, next_state, done)

            # Train agent once enough experience has been collected
            if len(agent.memory) > agent.batch_size:
                loss = agent.replay()
                if loss is not None:
                    logger.debug(f"Training loss: {loss:.4f}")

            # Update state and reward
            state = next_state
            total_reward += reward

            if done:
                break

        # Update epsilon
        agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

        # Log episode results
        logger.info(f"Episode: {episode + 1}/{episodes}")
        logger.info(f"Total Reward: {total_reward:.2f}")
        logger.info(f"Final Balance: {info['balance']:.2f}")
        logger.info(f"Max Drawdown: {info['max_drawdown']:.2%}")
        logger.info(f"Win Rate: {info['win_rate']:.2%}")
        logger.info(f"Epsilon: {agent.epsilon:.4f}")

        # Save best model
        if total_reward > best_reward:
            best_reward = total_reward
            agent.save(best_model_path)
            logger.info(f"New best model saved with reward: {best_reward:.2f}")

        # Save checkpoint every 100 episodes
        if (episode + 1) % 100 == 0:
            checkpoint_path = f'NN/models/saved/rl_model_episode_{episode + 1}.pth'
            agent.save(checkpoint_path)
            logger.info(f"Checkpoint saved at episode {episode + 1}")


if __name__ == "__main__":
    train_rl()