new training process and changes to the models (wip)
@@ -3,6 +3,10 @@ import gym
 from gym import spaces
 from typing import Dict, Tuple, List
 import pandas as pd
+import logging
+
+# Configure logger
+logger = logging.getLogger(__name__)
 
 class TradingEnvironment(gym.Env):
     """
@@ -12,97 +16,284 @@ class TradingEnvironment(gym.Env):
                  data: pd.DataFrame,
                  initial_balance: float = 100.0,
                  fee_rate: float = 0.0002,
-                 max_steps: int = 1000):
+                 max_steps: int = 1000,
+                 window_size: int = 20,
+                 risk_aversion: float = 0.2,     # Controls how much to penalize volatility
+                 price_scaling: str = 'zscore',  # 'zscore', 'minmax', or 'raw'
+                 reward_scaling: float = 10.0,   # Scale factor for rewards
+                 episode_penalty: float = 0.1):  # Penalty for active positions at end of episode
         super(TradingEnvironment, self).__init__()
 
         self.data = data
         self.initial_balance = initial_balance
         self.fee_rate = fee_rate
         self.max_steps = max_steps
+        self.window_size = window_size
+        self.risk_aversion = risk_aversion
+        self.price_scaling = price_scaling
+        self.reward_scaling = reward_scaling
+        self.episode_penalty = episode_penalty
 
-        # Action space: 0 (SELL), 1 (HOLD), 2 (BUY)
+        # Preprocess data if needed
+        self._preprocess_data()
+
+        # Action space: 0 (BUY), 1 (SELL), 2 (HOLD)
         self.action_space = spaces.Discrete(3)
 
         # Observation space: price data, technical indicators, and account state
+        feature_dim = self.data.shape[1] + 3  # Adding position, equity, unrealized_pnl
         self.observation_space = spaces.Box(
             low=-np.inf,
             high=np.inf,
-            shape=(data.shape[1],),  # Number of features
+            shape=(feature_dim,),
             dtype=np.float32
         )
 
         # Initialize state
         self.reset()
+
+    def _preprocess_data(self):
+        """Preprocess data - normalize or standardize features"""
+        # Store the original data for reference
+        self.original_data = self.data.copy()
+
+        # Normalize price data based on the selected method
+        if self.price_scaling == 'zscore':
+            # For each feature, apply z-score normalization
+            for col in self.data.columns:
+                if col in ['open', 'high', 'low', 'close']:
+                    mean = self.data[col].mean()
+                    std = self.data[col].std()
+                    if std > 0:
+                        self.data[col] = (self.data[col] - mean) / std
+                # Normalize volume separately
+                elif col == 'volume':
+                    mean = self.data[col].mean()
+                    std = self.data[col].std()
+                    if std > 0:
+                        self.data[col] = (self.data[col] - mean) / std
+
+        elif self.price_scaling == 'minmax':
+            # For each feature, apply min-max scaling
+            for col in self.data.columns:
+                min_val = self.data[col].min()
+                max_val = self.data[col].max()
+                if max_val > min_val:
+                    self.data[col] = (self.data[col] - min_val) / (max_val - min_val)
+
     def reset(self) -> np.ndarray:
         """Reset the environment to initial state"""
-        self.current_step = 0
+        self.current_step = self.window_size
         self.balance = self.initial_balance
-        self.position = 0  # 0: no position, 1: long position
+        self.position = 0  # 0: no position, 1: long position, -1: short position
         self.entry_price = 0
+        self.entry_time = 0
         self.total_trades = 0
         self.winning_trades = 0
+        self.losing_trades = 0
         self.total_pnl = 0
         self.balance_history = [self.initial_balance]
+        self.equity_history = [self.initial_balance]
+        self.max_balance = self.initial_balance
+        self.max_drawdown = 0
+
+        # Trading performance metrics
+        self.trade_durations = []  # Track how long trades are held
+        self.returns = []          # Track returns of each trade
+
+        # For analyzing trade clustering
+        self.last_action_time = 0
+        self.actions_taken = []
 
         return self._get_observation()
 
     def _get_observation(self) -> np.ndarray:
-        """Get current observation state"""
-        return self.data.iloc[self.current_step].values
+        """Get current observation state with account information"""
+        # Get market data for the current step
+        market_data = self.data.iloc[self.current_step].values
+
+        # Get current price
+        current_price = self.original_data.iloc[self.current_step]['close']
+
+        # Calculate unrealized PnL
+        unrealized_pnl = 0
+        if self.position != 0:
+            price_diff = current_price - self.entry_price
+            unrealized_pnl = self.position * price_diff
+
+        # Calculate total equity (balance + unrealized PnL)
+        equity = self.balance + unrealized_pnl
+
+        # Normalize account state
+        normalized_position = self.position  # -1, 0, or 1
+        normalized_equity = equity / self.initial_balance - 1.0  # Percent change from initial
+        normalized_unrealized_pnl = unrealized_pnl / self.initial_balance if self.initial_balance > 0 else 0
+
+        # Combine market data with account state
+        account_state = np.array([normalized_position, normalized_equity, normalized_unrealized_pnl])
+        observation = np.concatenate([market_data, account_state])
+
+        # Handle any NaN values
+        observation = np.nan_to_num(observation, nan=0.0)
+
+        return observation
 
     def _calculate_reward(self, action: int) -> Tuple[float, float]:
-        """Calculate reward based on action and outcome"""
-        current_price = self.data.iloc[self.current_step]['close']
-
-        # If we have an open position
-        if self.position != 0:
-            # Calculate PnL
-            pnl = self.position * (current_price - self.entry_price) / self.entry_price
-            fees = self.fee_rate * 2  # Entry and exit fees
-
-            # Close position
-            if (action == 0 and self.position > 0) or (action == 2 and self.position < 0):
-                net_pnl = pnl - fees
-                self.total_pnl += net_pnl
-                self.balance *= (1 + net_pnl)
-
-                self.total_trades += 1
-                if net_pnl > 0:
-                    self.winning_trades += 1
-
-                # Reward based on PnL
-                reward = net_pnl * 100  # Scale up for better learning
-
-                # Additional reward for win rate
-                win_rate = self.winning_trades / max(1, self.total_trades)
-                reward += win_rate * 0.1
-
-                # Reset position
-                self.position = 0
-                return reward
-
-            # Hold position
-            return pnl * 0.1  # Small reward for holding profitable positions
-
-        # No position
-        if action == 1:  # HOLD
-            return 0
-
-        # Open new position
-        if action in [0, 2]:  # SELL or BUY
-            self.position = -1 if action == 0 else 1
-            self.entry_price = current_price
-            return -self.fee_rate  # Small penalty for trading
-
-        return 0
+        """
+        Calculate reward based on action and outcome with improved risk-adjusted metrics
+
+        Args:
+            action: The action taken (0=BUY, 1=SELL, 2=HOLD)
+
+        Returns:
+            Tuple[float, float]: (reward, realized PnL for this step)
+        """
+        # Get current price
+        current_price = self.original_data.iloc[self.current_step]['close']
+
+        # Default reward is slightly negative to discourage excessive trading
+        reward = -0.0001
+        pnl = 0.0
+
+        # Handle different actions based on current position
+        if self.position == 0:  # No position
+            if action == 0:  # BUY
+                self.position = 1
+                self.entry_price = current_price
+                self.entry_time = self.current_step
+                reward = -self.fee_rate  # Small penalty for trading cost
+
+            elif action == 1:  # SELL (start short position)
+                self.position = -1
+                self.entry_price = current_price
+                self.entry_time = self.current_step
+                reward = -self.fee_rate  # Small penalty for trading cost
+
+            # else action == 2 (HOLD) - keep the small negative reward
+
+        elif self.position > 0:  # Long position
+            if action == 1:  # SELL (close long)
+                # Calculate profit/loss
+                price_diff = current_price - self.entry_price
+                pnl = price_diff / self.entry_price - 2 * self.fee_rate  # Account for entry and exit fees
+
+                # Adjust reward based on PnL and risk
+                reward = pnl * self.reward_scaling
+
+                # Track trade performance
+                self.total_trades += 1
+                if pnl > 0:
+                    self.winning_trades += 1
+                else:
+                    self.losing_trades += 1
+
+                # Calculate trade duration
+                trade_duration = self.current_step - self.entry_time
+                self.trade_durations.append(trade_duration)
+
+                # Update returns list
+                self.returns.append(pnl)
+
+                # Update balance and reset position
+                self.balance *= (1 + pnl)
+                self.balance_history.append(self.balance)
+                self.max_balance = max(self.max_balance, self.balance)
+                self.total_pnl += pnl
+
+                # Reset position
+                self.position = 0
+
+            elif action == 0:  # BUY (while already long)
+                # Penalize trying to increase an already active position
+                reward = -0.001
+
+            # else action == 2 (HOLD) - calculate unrealized P&L for reward
+            else:
+                price_diff = current_price - self.entry_price
+                unrealized_pnl = price_diff / self.entry_price
+
+                # Small reward/penalty based on unrealized P&L
+                reward = unrealized_pnl * 0.05  # Scale down to encourage holding good positions
+
+        elif self.position < 0:  # Short position
+            if action == 0:  # BUY (close short)
+                # Calculate profit/loss
+                price_diff = self.entry_price - current_price
+                pnl = price_diff / self.entry_price - 2 * self.fee_rate  # Account for entry and exit fees
+
+                # Adjust reward based on PnL and risk
+                reward = pnl * self.reward_scaling
+
+                # Track trade performance
+                self.total_trades += 1
+                if pnl > 0:
+                    self.winning_trades += 1
+                else:
+                    self.losing_trades += 1
+
+                # Calculate trade duration
+                trade_duration = self.current_step - self.entry_time
+                self.trade_durations.append(trade_duration)
+
+                # Update returns list
+                self.returns.append(pnl)
+
+                # Update balance and reset position
+                self.balance *= (1 + pnl)
+                self.balance_history.append(self.balance)
+                self.max_balance = max(self.max_balance, self.balance)
+                self.total_pnl += pnl
+
+                # Reset position
+                self.position = 0
+
+            elif action == 1:  # SELL (while already short)
+                # Penalize trying to increase an already active position
+                reward = -0.001
+
+            # else action == 2 (HOLD) - calculate unrealized P&L for reward
+            else:
+                price_diff = self.entry_price - current_price
+                unrealized_pnl = price_diff / self.entry_price
+
+                # Small reward/penalty based on unrealized P&L
+                reward = unrealized_pnl * 0.05  # Scale down to encourage holding good positions
+
+        # Record the action
+        self.actions_taken.append(action)
+        self.last_action_time = self.current_step
+
+        # Update equity history (balance + unrealized P&L)
+        current_equity = self.balance
+        if self.position != 0:
+            # Calculate unrealized P&L
+            if self.position > 0:  # Long
+                price_diff = current_price - self.entry_price
+                unrealized_pnl = price_diff / self.entry_price * self.balance
+            else:  # Short
+                price_diff = self.entry_price - current_price
+                unrealized_pnl = price_diff / self.entry_price * self.balance
+
+            current_equity = self.balance + unrealized_pnl
+
+        self.equity_history.append(current_equity)
+
+        # Calculate current drawdown
+        peak_equity = max(self.equity_history)
+        current_drawdown = (peak_equity - current_equity) / peak_equity if peak_equity > 0 else 0
+        self.max_drawdown = max(self.max_drawdown, current_drawdown)
+
+        # Apply risk aversion factor - penalize volatility
+        if len(self.returns) > 1:
+            returns_std = np.std(self.returns)
+            reward -= returns_std * self.risk_aversion
+
+        return reward, pnl
 
     def step(self, action: int) -> Tuple[np.ndarray, float, bool, Dict]:
         """Execute one step in the environment"""
-        # Calculate reward
-        reward = self._calculate_reward(action)
+        # Calculate reward and update state
+        reward, pnl = self._calculate_reward(action)
 
         # Move to next step
         self.current_step += 1
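A rough worked example of the reward rule introduced in _calculate_reward above (illustrative only, not part of the diff; the prices and earlier returns are invented, and the new defaults reward_scaling=10.0, risk_aversion=0.2, fee_rate=0.0002 are assumed):

# Illustrative numbers only: closing a long opened at 100.0 with the price now at 101.0
entry_price, current_price = 100.0, 101.0
fee_rate, reward_scaling, risk_aversion = 0.0002, 10.0, 0.2

pnl = (current_price - entry_price) / entry_price - 2 * fee_rate  # 0.01 - 0.0004 = 0.0096
reward = pnl * reward_scaling                                     # 0.096
# With two earlier trade returns [0.004, -0.002], their std is 0.003, so volatility is penalized:
reward -= 0.003 * risk_aversion                                   # 0.096 - 0.0006 = 0.0954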
@@ -110,26 +301,70 @@ class TradingEnvironment(gym.Env):
         # Check if episode is done
         done = self.current_step >= min(self.max_steps - 1, len(self.data) - 1)
 
+        # Apply penalty if episode ends with open position
+        if done and self.position != 0:
+            reward -= self.episode_penalty
+
+            # Force close the position at the end if still open
+            current_price = self.original_data.iloc[self.current_step]['close']
+            if self.position > 0:  # Long position
+                price_diff = current_price - self.entry_price
+                pnl = price_diff / self.entry_price - 2 * self.fee_rate
+            else:  # Short position
+                price_diff = self.entry_price - current_price
+                pnl = price_diff / self.entry_price - 2 * self.fee_rate
+
+            # Update balance
+            self.balance *= (1 + pnl)
+            self.total_pnl += pnl
+
+            # Track trade
+            self.total_trades += 1
+            if pnl > 0:
+                self.winning_trades += 1
+            else:
+                self.losing_trades += 1
+
+            # Reset position
+            self.position = 0
+
         # Get next observation
         observation = self._get_observation()
 
-        # Calculate max drawdown
-        max_drawdown = 0
-        if len(self.balance_history) > 1:
-            peak = self.balance_history[0]
-            for balance in self.balance_history:
-                peak = max(peak, balance)
-                drawdown = (peak - balance) / peak
-                max_drawdown = max(max_drawdown, drawdown)
+        # Calculate sharpe ratio and sortino ratio if possible
+        sharpe_ratio = 0
+        sortino_ratio = 0
+        win_rate = self.winning_trades / max(1, self.total_trades)
+
+        if len(self.returns) > 1:
+            mean_return = np.mean(self.returns)
+            std_return = np.std(self.returns)
+            if std_return > 0:
+                sharpe_ratio = mean_return / std_return
+
+            # For sortino, we only consider downside deviation
+            downside_returns = [r for r in self.returns if r < 0]
+            if downside_returns:
+                downside_deviation = np.std(downside_returns)
+                if downside_deviation > 0:
+                    sortino_ratio = mean_return / downside_deviation
+
+        # Calculate average trade duration
+        avg_trade_duration = np.mean(self.trade_durations) if self.trade_durations else 0
 
         # Additional info
         info = {
             'balance': self.balance,
             'position': self.position,
             'total_trades': self.total_trades,
-            'win_rate': self.winning_trades / max(1, self.total_trades),
+            'win_rate': win_rate,
             'total_pnl': self.total_pnl,
-            'max_drawdown': max_drawdown
+            'max_drawdown': self.max_drawdown,
+            'sharpe_ratio': sharpe_ratio,
+            'sortino_ratio': sortino_ratio,
+            'avg_trade_duration': avg_trade_duration,
+            'pnl': pnl,
+            'gain': (self.balance - self.initial_balance) / self.initial_balance
         }
 
         return observation, reward, done, info
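A minimal usage sketch of the updated environment (illustrative, not part of the commit): it assumes a DataFrame with open/high/low/close/volume columns and a random policy, and reads the new metrics from the info dict returned by step().

import numpy as np
import pandas as pd

# Hypothetical OHLCV data, purely for illustration
n = 500
close = 100 + np.cumsum(np.random.randn(n))
df = pd.DataFrame({'open': close, 'high': close + 0.5,
                   'low': close - 0.5, 'close': close,
                   'volume': np.random.rand(n) * 1000})

env = TradingEnvironment(df, initial_balance=100.0, window_size=20,
                         risk_aversion=0.2, price_scaling='zscore',
                         reward_scaling=10.0, episode_penalty=0.1)

obs = env.reset()                       # market features + [position, equity, unrealized_pnl]
done = False
while not done:
    action = env.action_space.sample()  # 0=BUY, 1=SELL, 2=HOLD
    obs, reward, done, info = env.step(action)

print(info['win_rate'], info['sharpe_ratio'], info['max_drawdown'], info['gain'])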
@@ -143,20 +378,19 @@ class TradingEnvironment(gym.Env):
         print(f"Total Trades: {self.total_trades}")
         print(f"Win Rate: {self.winning_trades/max(1, self.total_trades):.2%}")
         print(f"Total PnL: ${self.total_pnl:.2f}")
-        print(f"Max Drawdown: {self._calculate_max_drawdown():.2%}")
+        print(f"Max Drawdown: {self.max_drawdown:.2%}")
+        print(f"Sharpe Ratio: {self._calculate_sharpe_ratio():.4f}")
         print("-" * 50)
 
-    def _calculate_max_drawdown(self):
-        """Calculate maximum drawdown from balance history"""
-        if len(self.balance_history) <= 1:
+    def _calculate_sharpe_ratio(self):
+        """Calculate Sharpe ratio from returns"""
+        if len(self.returns) < 2:
             return 0.0
 
-        peak = self.balance_history[0]
-        max_drawdown = 0.0
+        mean_return = np.mean(self.returns)
+        std_return = np.std(self.returns)
 
-        for balance in self.balance_history:
-            peak = max(peak, balance)
-            drawdown = (peak - balance) / peak
-            max_drawdown = max(max_drawdown, drawdown)
+        if std_return == 0:
+            return 0.0
 
-        return max_drawdown
+        return mean_return / std_return
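As a reference for the metrics added to info, a small numeric illustration of how the diff computes Sharpe and Sortino from the per-trade returns list (no risk-free rate or annualization; the return values below are made up):

import numpy as np

returns = [0.01, -0.004, 0.006, -0.002]         # hypothetical per-trade returns
mean_return = np.mean(returns)                  # 0.0025
sharpe_ratio = mean_return / np.std(returns)    # std over all trade returns

downside = [r for r in returns if r < 0]        # losing trades only
sortino_ratio = mean_return / np.std(downside)  # downside deviation, as in the diff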