RL training

Author: Dobromir Popov
Date: 2025-03-31 03:31:54 +03:00
Parent: 1610d5bd49
Commit: 4eac14022c
9 changed files with 1492 additions and 247 deletions

NN/train_rl.py (new file, 192 lines added)

@@ -0,0 +1,192 @@
import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import logging
import time
from datetime import datetime
import os
import sys
import pandas as pd
import gym
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from NN.utils.data_interface import DataInterface
from NN.utils.trading_env import TradingEnvironment
from NN.models.dqn_agent import DQNAgent
from NN.utils.signal_interpreter import SignalInterpreter
# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rl_training.log'),
        logging.StreamHandler()
    ]
)

class RLTradingEnvironment(TradingEnvironment):
    """Extended trading environment that reshapes state for CNN"""

    def __init__(self, data, window_size, num_features, num_timeframes, **kwargs):
        # Set attributes before parent initialization
        self.window_size = window_size
        self.num_features = num_features
        self.num_timeframes = num_timeframes
        self.feature_dim = num_features * num_timeframes

        # Initialize parent class
        super().__init__(data=data, **kwargs)

        # Update observation space for CNN
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size, self.feature_dim),
            dtype=np.float32
        )

    def _get_observation(self):
        """Get current observation reshaped for CNN"""
        # Get flattened observation from parent class
        flat_obs = super()._get_observation()

        # Extract features (exclude close price)
        features = flat_obs[:-1]  # Remove close price

        # Calculate number of complete windows
        n_windows = len(features) // self.feature_dim
        if n_windows < self.window_size:
            # Pad with zeros if not enough data
            padding = np.zeros((self.window_size - n_windows, self.feature_dim))
            reshaped = np.vstack([
                padding,
                features[-(n_windows * self.feature_dim):].reshape(n_windows, self.feature_dim)
            ])
        else:
            # Take the most recent window_size windows
            start_idx = (n_windows - self.window_size) * self.feature_dim
            reshaped = features[start_idx:].reshape(self.window_size, self.feature_dim)

        return reshaped.astype(np.float32)
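
# Shape illustration for _get_observation(): with window_size=20 and the three
# timeframes used below, and assuming for the sake of example 5 features per
# timeframe (the real count comes from the data at runtime), feature_dim is
# 5 * 3 = 15 and the observation is a (20, 15) float32 array. If only, say,
# 12 complete windows of history are available, the first 8 rows are zero
# padding and the last 12 rows hold the most recent feature windows.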

def train_rl():
    """
    Train the RL model using the DQN agent
    """
    # Initialize data interface with BTC/USDT and multiple timeframes
    timeframes = ['1m', '5m', '15m']
    window_size = 20
    data_interface = DataInterface(symbol="BTC/USDT", timeframes=timeframes)

    # Get training data
    X_train, y_train, X_val, y_val, train_prices, val_prices = data_interface.prepare_training_data()
    if X_train is None:
        logger.error("Failed to get training data")
        return

    # Calculate feature dimensions
    num_features = X_train.shape[2]  # Number of features per timeframe
    total_features = num_features * len(timeframes)  # Total features across all timeframes

    # Flatten features for environment
    n_samples = X_train.shape[0]
    flattened_features = X_train.reshape(n_samples, window_size, -1)  # Reshape to (batch, window, features)

    # Create DataFrame with features as separate columns
    feature_columns = [f'feature_{i}' for i in range(flattened_features.shape[2])]
    df = pd.DataFrame(flattened_features.reshape(n_samples, -1), columns=feature_columns * window_size)
    df['close'] = train_prices

    # Create environment
    env = RLTradingEnvironment(
        data=df,
        window_size=window_size,
        num_features=num_features,
        num_timeframes=len(timeframes),
        initial_balance=10000,
        fee_rate=0.001,
        max_steps=1000
    )

    # Create DQN agent
    agent = DQNAgent(
        state_size=window_size,  # First dimension of observation space
        action_size=env.action_space.n,
        window_size=window_size,
        num_features=num_features,
        timeframes=timeframes,
        learning_rate=0.001,
        gamma=0.99,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.995,
        memory_size=10000,
        batch_size=32,
        target_update=10
    )

    # Training parameters
    episodes = 1000
    max_steps = 1000
    best_reward = float('-inf')
    best_model_path = 'NN/models/saved/best_rl_model.pth'

    # Create models directory if it doesn't exist
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

    # Training loop
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0

        for step in range(max_steps):
            # Get action from agent
            action = agent.act(state)

            # Take action in environment
            next_state, reward, done, info = env.step(action)

            # Store experience in agent's memory
            agent.remember(state, action, reward, next_state, done)

            # Train agent
            if len(agent.memory) > agent.batch_size:
                loss = agent.replay()
                if loss is not None:
                    logger.debug(f"Training loss: {loss:.4f}")

            # Update state and reward
            state = next_state
            total_reward += reward

            if done:
                break

        # Update epsilon
        agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

        # Log episode results
        logger.info(f"Episode: {episode + 1}/{episodes}")
        logger.info(f"Total Reward: {total_reward:.2f}")
        logger.info(f"Final Balance: {info['balance']:.2f}")
        logger.info(f"Max Drawdown: {info['max_drawdown']:.2%}")
        logger.info(f"Win Rate: {info['win_rate']:.2%}")
        logger.info(f"Epsilon: {agent.epsilon:.4f}")

        # Save best model
        if total_reward > best_reward:
            best_reward = total_reward
            agent.save(best_model_path)
            logger.info(f"New best model saved with reward: {best_reward:.2f}")

        # Save checkpoint every 100 episodes
        if (episode + 1) % 100 == 0:
            checkpoint_path = f'NN/models/saved/rl_model_episode_{episode + 1}.pth'
            agent.save(checkpoint_path)
            logger.info(f"Checkpoint saved at episode {episode + 1}")


if __name__ == "__main__":
    train_rl()
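
A side note on the exploration schedule above: with epsilon starting at 1.0, a per-episode decay factor of 0.995, and a floor of 0.01, the agent only reaches its minimum exploration rate near the end of the 1000-episode run. The standalone sketch below (plain Python, independent of the project code) reproduces the per-episode update used in train_rl() and the closed-form episode count; the milestone episode numbers are just illustrative print points.

import math

epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.995

# Closed form: epsilon after n episodes is max(epsilon_min, epsilon_decay ** n),
# so the floor is reached once 0.995 ** n <= 0.01, i.e. after about 919 episodes.
episodes_to_floor = math.log(epsilon_min / epsilon) / math.log(epsilon_decay)
print(f"epsilon hits {epsilon_min} after ~{episodes_to_floor:.0f} episodes")

# Same schedule replayed step by step, as train_rl() does once per episode
eps = epsilon
for episode in range(1000):
    eps = max(epsilon_min, eps * epsilon_decay)
    if episode + 1 in (100, 500, 919, 1000):
        print(f"episode {episode + 1}: epsilon = {eps:.4f}")

So roughly the first 900 or so episodes still act partly at random, which is worth keeping in mind when reading the per-episode reward logs and the "best model" saves.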