# gogo2/utils/reward_calculator.py

"""
Improved Reward Function for RL Trading Agent

This module provides a more sophisticated reward function for the RL trading agent
that incorporates realistic trading fees, penalties for excessive trading, and
rewards for successful holding of positions.
"""

import numpy as np
from datetime import datetime, timedelta
from collections import deque
import logging

logger = logging.getLogger(__name__)

class RewardCalculator:
    """Compute shaped rewards for an RL trading agent.

    The calculator keeps small rolling histories (realised P&L values and
    trade timestamps) and combines them with per-step inputs to produce a
    scalar reward.

    Action encoding used by ``calculate_enhanced_reward``: ``0`` is a buy,
    ``1`` is a sell, and any other value is treated as a hold.

    Attributes:
        base_fee_rate: Fee charged per trade side, in the same units as
            ``price_change``.
        reward_scaling: Multiplier applied to the net profit of a sell.
        risk_aversion: Weight of the standard-deviation penalty computed from
            ``returns``.
        trade_pnls: Rolling list of the most recent net profits recorded via
            ``record_pnl``.
        returns: List used for the risk-aversion penalty. No method of this
            class appends to it, so the penalty only applies when a caller
            fills it.
        trade_timestamps: Rolling list of ``time.time()`` values, one per
            recorded trade.
        frequency_threshold: Trades per minute above which the frequency
            penalty starts.
        max_frequency_penalty: Upper bound of the frequency penalty.
    """

    # Maximum number of entries kept in each rolling history list.
    HISTORY_LIMIT = 100

    def __init__(self, base_fee_rate=0.001, reward_scaling=10.0, risk_aversion=0.1):
        """Store the tunable parameters and create empty histories."""
        self.base_fee_rate = base_fee_rate
        self.reward_scaling = reward_scaling
        self.risk_aversion = risk_aversion
        self.trade_pnls = []
        self.returns = []
        self.trade_timestamps = []
        self.frequency_threshold = 10  # Trades per minute threshold for penalty
        self.max_frequency_penalty = 0.05

    def record_pnl(self, pnl):
        """Record P&L for risk adjustment calculations.

        Only the most recent ``HISTORY_LIMIT`` values are kept.
        """
        self.trade_pnls.append(pnl)
        # Slice-delete instead of a single pop(0): the bound is restored even
        # if the list was extended by more than one element elsewhere.
        del self.trade_pnls[:-self.HISTORY_LIMIT]

    def record_trade(self, action):
        """Record the wall-clock time of a trade for frequency penalty calculations.

        Args:
            action: The action that was taken. It is accepted for interface
                compatibility; only the timestamp is stored.
        """
        from time import time

        self.trade_timestamps.append(time())
        del self.trade_timestamps[:-self.HISTORY_LIMIT]

    def _calculate_frequency_penalty(self):
        """Calculate penalty for high-frequency trading.

        The rate is the number of stored timestamps divided by the time
        between the oldest and newest one, expressed per minute.

        Returns:
            ``0.0`` when fewer than two trades are stored, when the time span
            is not positive, or when the rate does not exceed
            ``frequency_threshold``. Otherwise ``0.001`` per trade-per-minute
            above the threshold, capped at ``max_frequency_penalty``.
        """
        if len(self.trade_timestamps) < 2:
            return 0.0
        time_span = self.trade_timestamps[-1] - self.trade_timestamps[0]
        if time_span <= 0:
            # Identical (or backwards) clock readings: no meaningful rate.
            return 0.0
        trades_per_minute = (len(self.trade_timestamps) / time_span) * 60
        if trades_per_minute > self.frequency_threshold:
            excess = trades_per_minute - self.frequency_threshold
            return min(self.max_frequency_penalty, excess * 0.001)
        return 0.0

    def _calculate_risk_adjustment(self, reward):
        """Adjust rewards based on risk (simple Sharpe ratio implementation).

        The reward is multiplied by ``1 + 0.5 * sharpe`` clipped to
        ``[0.5, 2.0]``, where ``sharpe`` is mean/std of ``trade_pnls``.
        The reward is returned unchanged when fewer than five P&L values are
        stored or when their standard deviation is zero.

        NOTE(review): no method of this class calls this helper.
        """
        if len(self.trade_pnls) < 5:
            return reward
        pnl_array = np.array(self.trade_pnls)
        mean_return = np.mean(pnl_array)
        std_return = np.std(pnl_array)
        if std_return == 0:
            return reward
        sharpe = mean_return / std_return
        adjustment_factor = np.clip(1.0 + 0.5 * sharpe, 0.5, 2.0)
        return reward * adjustment_factor

    def _calculate_holding_reward(self, position_held_time, price_change):
        """Calculate reward for holding a position.

        Args:
            position_held_time: How long the position has been held; it is
                divided by 60 (presumably seconds -> minutes, confirm with
                the caller).
            price_change: Price change over the holding period; only its sign
                is used.

        Returns:
            ``0.0005`` per 60 units of holding time, doubled when
            ``price_change`` is positive and halved when it is negative.
        """
        base_holding_reward = 0.0005 * (position_held_time / 60.0)
        if price_change > 0:
            return base_holding_reward * 2
        elif price_change < 0:
            return base_holding_reward * 0.5
        return base_holding_reward

    def calculate_basic_reward(self, pnl, confidence):
        """Calculate basic training reward based on P&L and confidence.

        A confident (``confidence > 0.7``) loss is penalised by
        ``2 * confidence`` and a confident win is boosted by
        ``1.5 * confidence``. The sum is squashed with ``tanh(x / 10)``.

        Returns:
            The reward as a Python ``float`` in ``[-1, 1]``, or ``0.0`` if any
            error occurs during the calculation (the error is logged).
        """
        try:
            base_reward = pnl
            if pnl < 0 and confidence > 0.7:
                confidence_adjustment = -confidence * 2
            elif pnl > 0 and confidence > 0.7:
                confidence_adjustment = confidence * 1.5
            else:
                confidence_adjustment = 0
            final_reward = base_reward + confidence_adjustment
            normalized_reward = np.tanh(final_reward / 10.0)
            logger.debug(f"Basic reward calculation: P&L={pnl:.4f}, confidence={confidence:.2f}, reward={normalized_reward:.4f}")
            return float(normalized_reward)
        except Exception as e:
            # Deliberate best-effort: a bad input yields a neutral reward.
            logger.error(f"Error calculating basic reward: {e}")
            return 0.0

    def calculate_enhanced_reward(self, action, price_change, position_held_time=0, volatility=None, is_profitable=False, confidence=0.0, predicted_change=0.0, actual_change=0.0, current_pnl=0.0, symbol='UNKNOWN'):
        """Calculate enhanced reward for trading actions.

        Args:
            action: ``0`` = buy, ``1`` = sell, anything else = hold.
            price_change: Price change used as the profit of a sell and as
                the sign input of the holding reward.
            position_held_time: Holding time passed to the holding reward.
            volatility: Optional volatility; ``abs(volatility) * 100`` is
                subtracted when given.
            is_profitable: Whether a held position is currently profitable.
            confidence: Accepted for interface compatibility; not used here.
            predicted_change: Predicted price change. The prediction bonus or
                penalty is applied only when this is non-zero.
            actual_change: Realised price change used to judge the prediction.
            current_pnl: Current P&L; 10% of it is added to the reward.
            symbol: Accepted for interface compatibility; not used here.

        Returns:
            The shaped reward.

        Side effects:
            A sell appends its net profit to ``trade_pnls``. Buys and sells
            append a timestamp to ``trade_timestamps``. Holds do not, because
            a hold is not a trade and must not inflate the trade frequency
            that drives the penalty.
        """
        fee = self.base_fee_rate
        # Computed from trades recorded before this call.
        frequency_penalty = self._calculate_frequency_penalty()
        is_trade = action in [0, 1]

        if action == 0:  # Buy
            reward = -fee - frequency_penalty
        elif action == 1:  # Sell
            profit_pct = price_change
            # A round trip pays the fee twice (entry and exit).
            net_profit = profit_pct - (fee * 2)
            reward = net_profit * self.reward_scaling
            reward -= frequency_penalty
            self.record_pnl(net_profit)
        else:  # Hold
            if is_profitable:
                reward = self._calculate_holding_reward(position_held_time, price_change)
            else:
                reward = -0.0001

        if is_trade and predicted_change != 0:
            # A buy is right when the price rose, a sell when it fell.
            if (action == 0 and actual_change > 0) or (action == 1 and actual_change < 0):
                reward += abs(actual_change) * 5.0
            else:
                reward -= abs(predicted_change) * 2.0

        reward += current_pnl * 0.1

        if volatility is not None:
            reward -= abs(volatility) * 100

        # Only has an effect when a caller has filled ``self.returns``.
        if self.risk_aversion > 0 and len(self.returns) > 1:
            returns_std = np.std(self.returns)
            reward -= returns_std * self.risk_aversion

        if is_trade:
            self.record_trade(action)
        return reward

    def calculate_prediction_reward(self, symbol, predicted_direction, actual_direction, confidence, predicted_change, actual_change, current_pnl=0.0, position_duration=0.0):
        """Calculate reward for prediction accuracy.

        Args:
            symbol: Key used to look up entries in the optional
                ``pnl_history`` attribute.
            predicted_direction: Direction the model predicted.
            actual_direction: Direction that actually occurred.
            confidence: Confidence of the prediction; scales the reward for a
                correct direction.
            predicted_change: Predicted change; magnitude bonuses and
                penalties apply only when ``abs(predicted_change) > 0.001``.
            actual_change: Realised change.
            current_pnl: Current P&L; 10% of it is added to the reward.
            position_duration: How long the position has been open; the
                loss-cutting penalty applies above 60 (units not visible
                here, confirm with the caller).

        Returns:
            The prediction reward.

        Notes:
            ``pnl_history`` is not created by this class. If a caller attaches
            it, it is read as a mapping from symbol to a list of entries;
            only ``dict`` entries contribute their ``'pnl'`` value.
        """
        direction_correct = predicted_direction == actual_direction
        significant_prediction = abs(predicted_change) > 0.001

        reward = 0.0
        if direction_correct:
            reward += 1.0 * confidence
        else:
            reward -= 0.5

        if significant_prediction:
            if direction_correct:
                reward += abs(actual_change) * 5.0
            else:
                reward -= abs(predicted_change) * 2.0

        reward += current_pnl * 0.1

        # Dynamic adjustment based on recent PnL (loss cutting incentive)
        pnl_history = getattr(self, 'pnl_history', None)
        if pnl_history is not None and symbol in pnl_history and pnl_history[symbol]:
            symbol_history = pnl_history[symbol]
            latest_pnl_entry = symbol_history[-1]
            latest_pnl_value = latest_pnl_entry.get('pnl', 0.0) if isinstance(latest_pnl_entry, dict) else 0.0
            if latest_pnl_value < 0 and position_duration > 60:
                reward -= (abs(latest_pnl_value) * 0.2)
            pnl_values = [entry.get('pnl', 0.0) for entry in symbol_history if isinstance(entry, dict)]
            best_pnl = max(pnl_values) if pnl_values else 0.0
            # Extra penalty when the position has never been in profit.
            if best_pnl < 0.0:
                reward -= 0.1
        return reward

# Example usage:
if __name__ == "__main__":
    import time

    # Calculator with the default fee, scaling and risk-aversion settings
    calc = RewardCalculator()

    # Reward for a buy action
    buy_reward = calc.calculate_enhanced_reward(action=0, price_change=0)
    print(f"Buy action reward: {buy_reward:.5f}")

    # Add one more trade to the frequency history by hand, then pause briefly
    # before the next action
    calc.record_trade(0)
    time.sleep(0.1)

    # Reward for a sell action with a positive price change
    sell_args = {"action": 1, "price_change": 0.015, "position_held_time": 60}
    sell_reward = calc.calculate_enhanced_reward(**sell_args)
    print(f"Sell action reward (with profit): {sell_reward:.5f}")

    # Reward for holding a position that is currently profitable
    profitable_hold_args = {
        "action": 2,
        "price_change": 0.01,
        "position_held_time": 30,
        "is_profitable": True,
    }
    hold_reward = calc.calculate_enhanced_reward(**profitable_hold_args)
    print(f"Hold action reward (profitable): {hold_reward:.5f}")

    # Reward for holding a position that is currently not profitable
    losing_hold_args = {
        "action": 2,
        "price_change": -0.01,
        "position_held_time": 30,
        "is_profitable": False,
    }
    hold_reward_neg = calc.calculate_enhanced_reward(**losing_hold_args)
    print(f"Hold action reward (unprofitable): {hold_reward_neg:.5f}")