"""
Improved Reward Function for RL Trading Agent
This module provides a more sophisticated reward function for the RL trading agent
that incorporates realistic trading fees, penalties for excessive trading, and
rewards for successful holding of positions.
"""
import logging
import time

import numpy as np

logger = logging.getLogger(__name__)


class RewardCalculator:
    def __init__(self, base_fee_rate=0.001, reward_scaling=10.0, risk_aversion=0.1):
        self.base_fee_rate = base_fee_rate
        self.reward_scaling = reward_scaling
        self.risk_aversion = risk_aversion
        self.trade_pnls = []
        self.returns = []  # Per-step returns used for the risk-aversion adjustment (not populated internally)
        self.trade_timestamps = []
        self.frequency_threshold = 10  # Trades per minute before the frequency penalty kicks in
        self.max_frequency_penalty = 0.05

    def record_pnl(self, pnl):
        """Record P&L for risk-adjustment calculations (keeps the last 100 trades)"""
        self.trade_pnls.append(pnl)
        if len(self.trade_pnls) > 100:
            self.trade_pnls.pop(0)

    def record_trade(self, action):
        """Record a trade timestamp for frequency-penalty calculations (the action value itself is unused)"""
        self.trade_timestamps.append(time.time())
        if len(self.trade_timestamps) > 100:
            self.trade_timestamps.pop(0)

    def _calculate_frequency_penalty(self):
        """Calculate a penalty for high-frequency trading"""
        if len(self.trade_timestamps) < 2:
            return 0.0
        time_span = self.trade_timestamps[-1] - self.trade_timestamps[0]
        if time_span <= 0:
            return 0.0
        trades_per_minute = (len(self.trade_timestamps) / time_span) * 60
        if trades_per_minute > self.frequency_threshold:
            return min(self.max_frequency_penalty, (trades_per_minute - self.frequency_threshold) * 0.001)
        return 0.0
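
    # Worked example (illustrative numbers): 30 recorded trades spanning 60 seconds
    # gives 30 trades/minute, so the penalty is min(0.05, (30 - 10) * 0.001) = 0.02.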

    def _calculate_risk_adjustment(self, reward):
        """Adjust rewards based on risk (simple Sharpe ratio implementation)"""
        if len(self.trade_pnls) < 5:
            return reward
        pnl_array = np.array(self.trade_pnls)
        mean_return = np.mean(pnl_array)
        std_return = np.std(pnl_array)
        if std_return == 0:
            return reward
        sharpe = mean_return / std_return
        adjustment_factor = np.clip(1.0 + 0.5 * sharpe, 0.5, 2.0)
        return reward * adjustment_factor
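
    # Worked example (illustrative numbers): mean PnL 0.02 with std 0.04 gives a
    # Sharpe of 0.5, so the factor is clip(1.0 + 0.25, 0.5, 2.0) = 1.25 and the
    # reward is scaled up by 25%.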

    def _calculate_holding_reward(self, position_held_time, price_change):
        """Calculate reward for holding a position, scaled by holding time in minutes"""
        base_holding_reward = 0.0005 * (position_held_time / 60.0)
        if price_change > 0:
            return base_holding_reward * 2
        elif price_change < 0:
            return base_holding_reward * 0.5
        return base_holding_reward
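
    # Worked example: holding for 120 seconds with a positive price change yields
    # 0.0005 * (120 / 60) * 2 = 0.002; with a negative change it would be 0.0005.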

    def calculate_basic_reward(self, pnl, confidence):
        """Calculate a basic training reward from P&L and model confidence"""
        try:
            base_reward = pnl
            # High-confidence wins are amplified; high-confidence losses are punished harder
            if pnl < 0 and confidence > 0.7:
                confidence_adjustment = -confidence * 2
            elif pnl > 0 and confidence > 0.7:
                confidence_adjustment = confidence * 1.5
            else:
                confidence_adjustment = 0
            final_reward = base_reward + confidence_adjustment
            # Squash into (-1, 1) for training stability
            normalized_reward = np.tanh(final_reward / 10.0)
            logger.debug(f"Basic reward calculation: P&L={pnl:.4f}, confidence={confidence:.2f}, reward={normalized_reward:.4f}")
            return float(normalized_reward)
        except Exception as e:
            logger.error(f"Error calculating basic reward: {e}")
            return 0.0
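
    # Worked example: pnl = 2.0 with confidence = 0.8 adds 0.8 * 1.5 = 1.2, and
    # tanh(3.2 / 10.0) ≈ 0.31 is returned.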

    def calculate_enhanced_reward(self, action, price_change, position_held_time=0,
                                  volatility=None, is_profitable=False, confidence=0.0,
                                  predicted_change=0.0, actual_change=0.0,
                                  current_pnl=0.0, symbol='UNKNOWN'):
        """Calculate an enhanced reward for trading actions with a shifted neutral point.

        The neutral reward is shifted so that a trade must earn more than double the
        round-trip fees to count as positive, which penalizes small-profit trades and
        encourages holding for larger moves. Current PnL is weighted heavily in the
        decision-making process.
        """
        fee = self.base_fee_rate
        double_fee = fee * 4  # Double the round-trip fee (open + close = 2x base, doubled = 4x)
        frequency_penalty = self._calculate_frequency_penalty()
        if action == 0:  # Buy
            reward = -fee - frequency_penalty
            if current_pnl > 0:
                # Discourage adding to a position that is already profitable
                reward -= current_pnl * 0.2
        elif action == 1:  # Sell
            profit_pct = price_change
            # Shifted neutral point: profit must exceed double the round-trip fees
            net_profit = profit_pct - double_fee
            if net_profit > 0:
                # Superlinear reward for larger profits
                reward = (net_profit ** 1.5) * self.reward_scaling
            else:
                # Linear penalty for losses
                reward = net_profit * self.reward_scaling
            reward -= frequency_penalty
            self.record_pnl(net_profit)
            # Extra penalty for very small profits (less than 3x the round-trip fee)
            if 0 < profit_pct < (fee * 6):
                reward -= 0.5  # Discourage tiny profit-taking
        else:  # Hold
            if is_profitable:
                # Increase reward for holding profitable positions
                profit_factor = min(5.0, current_pnl * 20)  # Cap at 5x
                reward = self._calculate_holding_reward(position_held_time, price_change) * (1.0 + profit_factor)
                # Bonus for holding through volatility while profitable
                if volatility is not None and volatility > 0.001:
                    reward += 0.1 * volatility * 100
            else:
                # Small penalty for holding losing positions
                loss_factor = min(1.0, abs(current_pnl) * 10)
                reward = -0.0001 * (1.0 + loss_factor)
                # Reduce the penalty for very recent positions (give them time)
                if position_held_time < 30:  # Less than 30 seconds
                    reward *= 0.5
        # Prediction accuracy component: reward correct direction calls, penalize misses
        if action in [0, 1] and predicted_change != 0:
            if (action == 0 and actual_change > 0) or (action == 1 and actual_change < 0):
                reward += abs(actual_change) * 5.0
            else:
                reward -= abs(predicted_change) * 2.0
        # Weight current PnL directly in the reward
        reward += current_pnl * 0.3
        # Volatility penalty
        if volatility is not None:
            reward -= abs(volatility) * 100
        # Risk adjustment based on the dispersion of recorded returns
        if self.risk_aversion > 0 and len(self.returns) > 1:
            returns_std = np.std(self.returns)
            reward -= returns_std * self.risk_aversion
        self.record_trade(action)
        return reward
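
    # Worked example (sell, no frequency penalty or volatility): price_change = 0.01
    # gives net_profit = 0.01 - 0.004 = 0.006, so reward ≈ 0.006 ** 1.5 * 10 ≈ 0.0046
    # before the prediction-accuracy and PnL terms are applied.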

    def calculate_prediction_reward(self, symbol, predicted_direction, actual_direction,
                                    confidence, predicted_change, actual_change,
                                    current_pnl=0.0, position_duration=0.0):
        """Calculate reward for prediction accuracy"""
        reward = 0.0
        if predicted_direction == actual_direction:
            reward += 1.0 * confidence
        else:
            reward -= 0.5
        if predicted_direction == actual_direction and abs(predicted_change) > 0.001:
            reward += abs(actual_change) * 5.0
        if predicted_direction != actual_direction and abs(predicted_change) > 0.001:
            reward -= abs(predicted_change) * 2.0
        reward += current_pnl * 0.1
        # Dynamic adjustment based on recent PnL to incentivize loss cutting.
        # pnl_history is an optional attribute set externally; it is skipped when absent.
        if hasattr(self, 'pnl_history') and symbol in self.pnl_history and self.pnl_history[symbol]:
            latest_pnl_entry = self.pnl_history[symbol][-1]
            latest_pnl_value = latest_pnl_entry.get('pnl', 0.0) if isinstance(latest_pnl_entry, dict) else 0.0
            if latest_pnl_value < 0 and position_duration > 60:
                reward -= abs(latest_pnl_value) * 0.2
            pnl_values = [entry.get('pnl', 0.0) for entry in self.pnl_history[symbol] if isinstance(entry, dict)]
            best_pnl = max(pnl_values) if pnl_values else 0.0
            if best_pnl < 0.0:
                reward -= 0.1
        return reward
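
    # Worked example: a correct direction call with confidence 0.8, predicted_change
    # 0.01, and actual_change 0.012 scores 0.8 + 0.012 * 5.0 = 0.86 (with zero PnL
    # and no pnl_history set).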


# Example usage:
if __name__ == "__main__":
    # Create a calculator instance
    reward_calc = RewardCalculator()

    # Example reward for a buy action
    buy_reward = reward_calc.calculate_enhanced_reward(action=0, price_change=0)
    print(f"Buy action reward: {buy_reward:.5f}")

    # Example reward for a sell action with profit
    sell_reward = reward_calc.calculate_enhanced_reward(action=1, price_change=0.015, position_held_time=60)
    print(f"Sell action reward (with profit): {sell_reward:.5f}")

    # Example reward for a hold action on a profitable position
    hold_reward = reward_calc.calculate_enhanced_reward(action=2, price_change=0.01, position_held_time=30, is_profitable=True)
    print(f"Hold action reward (profitable): {hold_reward:.5f}")

    # Example reward for a hold action on an unprofitable position
    hold_reward_neg = reward_calc.calculate_enhanced_reward(action=2, price_change=-0.01, position_held_time=30, is_profitable=False)
    print(f"Hold action reward (unprofitable): {hold_reward_neg:.5f}")

    # Record a rapid burst of trades to demonstrate the frequency penalty
    for _ in range(30):
        reward_calc.record_trade(0)
        time.sleep(0.01)
    print(f"Frequency penalty after burst: {reward_calc._calculate_frequency_penalty():.5f}")