cleanup, cob ladder still broken
@@ -9,215 +9,166 @@ rewards for successful holding of positions.
import numpy as np
from datetime import datetime, timedelta
from collections import deque
import logging

logger = logging.getLogger(__name__)

class RewardCalculator:
    def __init__(self, base_fee_rate=0.001, reward_scaling=10.0, risk_aversion=0.1):
        self.base_fee_rate = base_fee_rate
        self.reward_scaling = reward_scaling
        self.risk_aversion = risk_aversion
        self.risk_adjusted = True  # Use Sharpe ratio for risk adjustment

        # Keep track of recent trades for frequency penalties
        self.recent_trades = deque(maxlen=1000)
        self.trade_timestamps = []
        self.frequency_threshold = 10  # Trades per minute before a penalty applies
        self.max_frequency_penalty = 0.05

        # P&L history for risk adjustment
        self.trade_pnls = []
        self.returns = []

        # Additional tracking metrics
        self.total_trades = 0
        self.profitable_trades = 0
        self.total_pnl = 0.0
        self.daily_pnl = {}
        self.hourly_pnl = {}

    def record_pnl(self, pnl):
        """Record a P&L result for risk adjustment and tracking metrics"""
        self.trade_pnls.append(pnl)

        # Update overall metrics
        self.total_trades += 1
        self.total_pnl += pnl

        if pnl > 0:
            self.profitable_trades += 1

        # Track daily and hourly PnL
        now = datetime.now()
        day_key = now.strftime('%Y-%m-%d')
        hour_key = now.strftime('%Y-%m-%d %H:00')

        # Update daily PnL
        if day_key not in self.daily_pnl:
            self.daily_pnl[day_key] = 0.0
        self.daily_pnl[day_key] += pnl

        # Update hourly PnL
        if hour_key not in self.hourly_pnl:
            self.hourly_pnl[hour_key] = 0.0
        self.hourly_pnl[hour_key] += pnl

        # Keep only the most recent 100 P&L results
        if len(self.trade_pnls) > 100:
            self.trade_pnls.pop(0)

    def record_trade(self, action=None, timestamp=None, price=None):
        """Record a trade action for frequency-penalty calculations"""
        if timestamp is None:
            timestamp = datetime.now()

        self.recent_trades.append({
            'timestamp': timestamp,
            'action': action,
            'price': price
        })

        from time import time
        self.trade_timestamps.append(time())
        if len(self.trade_timestamps) > 100:
            self.trade_timestamps.pop(0)

    def _calculate_frequency_penalty(self):
        """Calculate penalty for high-frequency trading"""
        if len(self.trade_timestamps) < 2:
            return 0.0

        # Estimate the recent trade rate from the recorded timestamps
        time_span = self.trade_timestamps[-1] - self.trade_timestamps[0]
        if time_span <= 0:
            return 0.0
        trades_per_minute = (len(self.trade_timestamps) / time_span) * 60

        # Progressive penalty: grows with the excess trade rate, capped at the maximum
        if trades_per_minute > self.frequency_threshold:
            penalty = min(self.max_frequency_penalty,
                          (trades_per_minute - self.frequency_threshold) * 0.001)
            return penalty
        return 0.0
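
    # Worked example (illustrative): 20 trades recorded over a 60-second span give
    # trades_per_minute = (20 / 60) * 60 = 20; that exceeds frequency_threshold = 10,
    # so the penalty is min(0.05, (20 - 10) * 0.001) = 0.01.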

    def _calculate_risk_adjustment(self, reward):
        """Adjust rewards based on risk (simple Sharpe ratio implementation)"""
        if len(self.trade_pnls) < 5:
            return reward  # Not enough data for adjustment

        # Calculate mean and standard deviation of recorded P&L results
        pnl_array = np.array(self.trade_pnls)
        mean_return = np.mean(pnl_array)
        std_return = np.std(pnl_array)

        if std_return == 0:
            return reward  # Avoid division by zero

        # Simplified Sharpe ratio
        sharpe = mean_return / std_return

        # Scale reward by Sharpe ratio (normalized to be around 1.0)
        adjustment_factor = np.clip(1.0 + 0.5 * sharpe, 0.5, 2.0)

        return reward * adjustment_factor
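
    # Worked example (illustrative): with recorded P&L values averaging 0.002 and a
    # standard deviation of 0.004, sharpe = 0.5, so the adjustment factor is
    # clip(1.0 + 0.5 * 0.5, 0.5, 2.0) = 1.25 and the reward is scaled by 1.25.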

    def _calculate_holding_reward(self, position_held_time, price_change):
        """Calculate reward for holding a position"""
        base_holding_reward = 0.0005 * (position_held_time / 60.0)
        if price_change > 0:
            return base_holding_reward * 2
        elif price_change < 0:
            return base_holding_reward * 0.5
        return base_holding_reward
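
    # Worked example (illustrative): holding for 120 time units gives a base holding
    # reward of 0.0005 * (120 / 60) = 0.001, doubled to 0.002 when price_change > 0
    # and halved to 0.0005 when price_change < 0.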

    def calculate_basic_reward(self, pnl, confidence):
        """Calculate basic training reward based on P&L and confidence"""
        try:
            base_reward = pnl

            # Penalize confident losing decisions, boost confident winning ones
            if pnl < 0 and confidence > 0.7:
                confidence_adjustment = -confidence * 2
            elif pnl > 0 and confidence > 0.7:
                confidence_adjustment = confidence * 1.5
            else:
                confidence_adjustment = 0

            final_reward = base_reward + confidence_adjustment

            # Squash into (-1, 1) so the reward scale stays bounded
            normalized_reward = np.tanh(final_reward / 10.0)

            logger.debug(f"Basic reward calculation: P&L={pnl:.4f}, confidence={confidence:.2f}, reward={normalized_reward:.4f}")
            return float(normalized_reward)
        except Exception as e:
            logger.error(f"Error calculating basic reward: {e}")
            return 0.0
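
    # Worked example (illustrative): pnl=2.0 with confidence=0.8 adds a confidence
    # adjustment of 0.8 * 1.5 = 1.2, so the final reward is tanh(3.2 / 10.0) ≈ 0.31.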

    def calculate_enhanced_reward(self, action, price_change, position_held_time=0,
                                  volatility=None, is_profitable=False, confidence=0.0,
                                  predicted_change=0.0, actual_change=0.0,
                                  current_pnl=0.0, symbol='UNKNOWN'):
        """
        Calculate the enhanced reward for a trading action.

        Args:
            action (int): 0 = Buy, 1 = Sell, 2 = Hold
            price_change (float): Percent price change for the trade
            position_held_time (int): Time the position was held (in time units)
            volatility (float, optional): Market volatility measure
            is_profitable (bool): Whether the current position is profitable
            confidence (float): Model confidence in the action
            predicted_change (float): Predicted price change
            actual_change (float): Actual price change
            current_pnl (float): Current unrealized P&L
            symbol (str): Traded symbol

        Returns:
            float: Calculated reward value
        """
        # Trading fee and frequency penalty
        fee = self.base_fee_rate
        frequency_penalty = self._calculate_frequency_penalty()

        # Base reward calculation
        if action == 0:  # Buy
            # Small penalty for the transaction plus the frequency penalty
            reward = -fee - frequency_penalty

        elif action == 1:  # Sell
            # Profit percentage minus fees (both entry and exit)
            net_profit = price_change - (fee * 2)

            # Scale reward and apply frequency penalty
            reward = net_profit * self.reward_scaling
            reward -= frequency_penalty

            # Record PnL for risk adjustment
            self.record_pnl(net_profit)

        else:  # Hold
            # Small reward for holding a profitable position, small cost otherwise
            if is_profitable:
                reward = self._calculate_holding_reward(position_held_time, price_change)
            else:
                reward = -0.0001  # Very small negative reward

        # Apply risk adjustment if enabled
        if self.risk_adjusted:
            reward = self._calculate_risk_adjustment(reward)

        # Reward correct directional calls, penalize incorrect ones
        if action in [0, 1] and predicted_change != 0:
            if (action == 0 and actual_change > 0) or (action == 1 and actual_change < 0):
                reward += abs(actual_change) * 5.0
            else:
                reward -= abs(predicted_change) * 2.0

        # Account for current unrealized P&L, volatility, and return variance
        reward += current_pnl * 0.1
        if volatility is not None:
            reward -= abs(volatility) * 100
        if self.risk_aversion > 0 and len(self.returns) > 1:
            returns_std = np.std(self.returns)
            reward -= returns_std * self.risk_aversion

        # Record this action for future frequency calculations
        self.record_trade(action)

        return reward
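
    # Worked example (illustrative): a sell with price_change=0.015 and the default
    # fee of 0.001 gives net_profit = 0.015 - 0.002 = 0.013 and a base reward of
    # 0.013 * reward_scaling (10.0) = 0.13, before any frequency penalty, risk
    # adjustment, or prediction/PnL/volatility terms are applied.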

    def calculate_prediction_reward(self, symbol, predicted_direction, actual_direction,
                                    confidence, predicted_change, actual_change,
                                    current_pnl=0.0, position_duration=0.0):
        """Calculate reward for prediction accuracy"""
        reward = 0.0

        # Direction accuracy, weighted by confidence
        if predicted_direction == actual_direction:
            reward += 1.0 * confidence
        else:
            reward -= 0.5

        # Magnitude bonus/penalty for non-trivial predicted moves
        if predicted_direction == actual_direction and abs(predicted_change) > 0.001:
            reward += abs(actual_change) * 5.0
        if predicted_direction != actual_direction and abs(predicted_change) > 0.001:
            reward -= abs(predicted_change) * 2.0

        reward += current_pnl * 0.1

        # Dynamic adjustment based on recent PnL (loss-cutting incentive);
        # pnl_history is optional and only used when populated externally
        if hasattr(self, 'pnl_history') and symbol in self.pnl_history and self.pnl_history[symbol]:
            latest_pnl_entry = self.pnl_history[symbol][-1]
            latest_pnl_value = latest_pnl_entry.get('pnl', 0.0) if isinstance(latest_pnl_entry, dict) else 0.0
            if latest_pnl_value < 0 and position_duration > 60:
                reward -= abs(latest_pnl_value) * 0.2
            pnl_values = [entry.get('pnl', 0.0) for entry in self.pnl_history[symbol] if isinstance(entry, dict)]
            best_pnl = max(pnl_values) if pnl_values else 0.0
            if best_pnl < 0.0:
                reward -= 0.1

        return reward


# Example usage:
if __name__ == "__main__":
    # Create calculator instance
    reward_calc = RewardCalculator()

    # Example reward for a buy action
    buy_reward = reward_calc.calculate_enhanced_reward(action=0, price_change=0)
    print(f"Buy action reward: {buy_reward:.5f}")

    # Record a trade for frequency tracking
    reward_calc.record_trade(action=0)

    # Wait a bit and make another trade to test the frequency penalty
    import time
    time.sleep(0.1)

    # Example reward for a sell action with profit
    sell_reward = reward_calc.calculate_enhanced_reward(action=1, price_change=0.015, position_held_time=60)
    print(f"Sell action reward (with profit): {sell_reward:.5f}")

    # Example reward for a hold action on a profitable position
    hold_reward = reward_calc.calculate_enhanced_reward(action=2, price_change=0.01, position_held_time=30, is_profitable=True)
    print(f"Hold action reward (profitable): {hold_reward:.5f}")

    # Example reward for a hold action on an unprofitable position
    hold_reward_neg = reward_calc.calculate_enhanced_reward(action=2, price_change=-0.01, position_held_time=30, is_profitable=False)
    print(f"Hold action reward (unprofitable): {hold_reward_neg:.5f}")
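
    # Additional illustrative calls (not part of the original example; values are arbitrary)
    basic_reward = reward_calc.calculate_basic_reward(pnl=2.0, confidence=0.8)
    print(f"Basic reward: {basic_reward:.5f}")

    prediction_reward = reward_calc.calculate_prediction_reward(
        symbol='ETH/USDT', predicted_direction=1, actual_direction=1,
        confidence=0.8, predicted_change=0.01, actual_change=0.012)
    print(f"Prediction reward: {prediction_reward:.5f}")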