"""
Improved Reward Function for RL Trading Agent
This module provides a more sophisticated reward function for the RL trading agent
that incorporates realistic trading fees, penalties for excessive trading, and
rewards for successful holding of positions.
"""
import logging
import time

import numpy as np

logger = logging.getLogger(__name__)


class RewardCalculator:
    def __init__(self, base_fee_rate=0.001, reward_scaling=10.0, risk_aversion=0.1):
        self.base_fee_rate = base_fee_rate
        self.reward_scaling = reward_scaling
        self.risk_aversion = risk_aversion
        self.trade_pnls = []
        self.returns = []  # Per-step returns used for the risk-aversion adjustment (not populated internally)
        self.trade_timestamps = []
        self.frequency_threshold = 10  # Trades per minute before the frequency penalty kicks in
        self.max_frequency_penalty = 0.05

    def record_pnl(self, pnl):
        """Record P&L for risk-adjustment calculations (keeps the last 100 trades)"""
        self.trade_pnls.append(pnl)
        if len(self.trade_pnls) > 100:
            self.trade_pnls.pop(0)

    def record_trade(self, action):
        """Record a trade timestamp for frequency-penalty calculations (the action value itself is unused)"""
        self.trade_timestamps.append(time.time())
        if len(self.trade_timestamps) > 100:
            self.trade_timestamps.pop(0)

    def _calculate_frequency_penalty(self):
        """Calculate a penalty for high-frequency trading"""
        if len(self.trade_timestamps) < 2:
            return 0.0
        time_span = self.trade_timestamps[-1] - self.trade_timestamps[0]
        if time_span <= 0:
            return 0.0
        trades_per_minute = (len(self.trade_timestamps) / time_span) * 60
        if trades_per_minute > self.frequency_threshold:
            return min(self.max_frequency_penalty, (trades_per_minute - self.frequency_threshold) * 0.001)
        return 0.0
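
    # Worked example (illustrative numbers): 30 recorded trades spanning 60 seconds
    # gives 30 trades/minute, so the penalty is min(0.05, (30 - 10) * 0.001) = 0.02.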

    def _calculate_risk_adjustment(self, reward):
        """Adjust rewards based on risk (simple Sharpe ratio implementation)"""
        if len(self.trade_pnls) < 5:
            return reward
        pnl_array = np.array(self.trade_pnls)
        mean_return = np.mean(pnl_array)
        std_return = np.std(pnl_array)
        if std_return == 0:
            return reward
        sharpe = mean_return / std_return
        adjustment_factor = np.clip(1.0 + 0.5 * sharpe, 0.5, 2.0)
        return reward * adjustment_factor
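
    # Worked example (illustrative numbers): mean PnL 0.02 with std 0.04 gives a
    # Sharpe of 0.5, so the factor is clip(1.0 + 0.25, 0.5, 2.0) = 1.25 and the
    # reward is scaled up by 25%.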

    def _calculate_holding_reward(self, position_held_time, price_change):
        """Calculate reward for holding a position, scaled by holding time in minutes"""
        base_holding_reward = 0.0005 * (position_held_time / 60.0)
        if price_change > 0:
            return base_holding_reward * 2
        elif price_change < 0:
            return base_holding_reward * 0.5
        return base_holding_reward
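
    # Worked example: holding for 120 seconds with a positive price change yields
    # 0.0005 * (120 / 60) * 2 = 0.002; with a negative change it would be 0.0005.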

    def calculate_basic_reward(self, pnl, confidence):
        """Calculate a basic training reward from P&L and model confidence"""
        try:
            base_reward = pnl
            # High-confidence wins are amplified; high-confidence losses are punished harder
            if pnl < 0 and confidence > 0.7:
                confidence_adjustment = -confidence * 2
            elif pnl > 0 and confidence > 0.7:
                confidence_adjustment = confidence * 1.5
            else:
                confidence_adjustment = 0
            final_reward = base_reward + confidence_adjustment
            # Squash into (-1, 1) for training stability
            normalized_reward = np.tanh(final_reward / 10.0)
            logger.debug(f"Basic reward calculation: P&L={pnl:.4f}, confidence={confidence:.2f}, reward={normalized_reward:.4f}")
            return float(normalized_reward)
        except Exception as e:
            logger.error(f"Error calculating basic reward: {e}")
            return 0.0
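
    # Worked example: pnl = 2.0 with confidence = 0.8 adds 0.8 * 1.5 = 1.2, and
    # tanh(3.2 / 10.0) ≈ 0.31 is returned.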

    def calculate_enhanced_reward(self, action, price_change, position_held_time=0,
                                  volatility=None, is_profitable=False, confidence=0.0,
                                  predicted_change=0.0, actual_change=0.0,
                                  current_pnl=0.0, symbol='UNKNOWN'):
        """Calculate an enhanced reward for trading actions with a shifted neutral point.

        The neutral reward is shifted so that a trade must earn more than double the
        round-trip fees to count as positive, which penalizes small-profit trades and
        encourages holding for larger moves. Current PnL is weighted heavily in the
        decision-making process.
        """
        fee = self.base_fee_rate
        double_fee = fee * 4  # Double the round-trip fee (open + close = 2x base, doubled = 4x)
        frequency_penalty = self._calculate_frequency_penalty()
        if action == 0:  # Buy
            reward = -fee - frequency_penalty
            if current_pnl > 0:
                # Discourage adding to a position that is already profitable
                reward -= current_pnl * 0.2
        elif action == 1:  # Sell
            profit_pct = price_change
            # Shifted neutral point: profit must exceed double the round-trip fees
            net_profit = profit_pct - double_fee
            if net_profit > 0:
                # Superlinear reward for larger profits
                reward = (net_profit ** 1.5) * self.reward_scaling
            else:
                # Linear penalty for losses
                reward = net_profit * self.reward_scaling
            reward -= frequency_penalty
            self.record_pnl(net_profit)
            # Extra penalty for very small profits (less than 3x the round-trip fee)
            if 0 < profit_pct < (fee * 6):
                reward -= 0.5  # Discourage tiny profit-taking
        else:  # Hold
            if is_profitable:
                # Increase reward for holding profitable positions
                profit_factor = min(5.0, current_pnl * 20)  # Cap at 5x
                reward = self._calculate_holding_reward(position_held_time, price_change) * (1.0 + profit_factor)
                # Bonus for holding through volatility while profitable
                if volatility is not None and volatility > 0.001:
                    reward += 0.1 * volatility * 100
            else:
                # Small penalty for holding losing positions
                loss_factor = min(1.0, abs(current_pnl) * 10)
                reward = -0.0001 * (1.0 + loss_factor)
                # Reduce the penalty for very recent positions (give them time)
                if position_held_time < 30:  # Less than 30 seconds
                    reward *= 0.5
        # Prediction accuracy component: reward correct direction calls, penalize misses
        if action in [0, 1] and predicted_change != 0:
            if (action == 0 and actual_change > 0) or (action == 1 and actual_change < 0):
                reward += abs(actual_change) * 5.0
            else:
                reward -= abs(predicted_change) * 2.0
        # Weight current PnL directly in the reward
        reward += current_pnl * 0.3
        # Volatility penalty
        if volatility is not None:
            reward -= abs(volatility) * 100
        # Risk adjustment based on the dispersion of recorded returns
        if self.risk_aversion > 0 and len(self.returns) > 1:
            returns_std = np.std(self.returns)
            reward -= returns_std * self.risk_aversion
        self.record_trade(action)
        return reward
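
    # Worked example (sell, no frequency penalty or volatility): price_change = 0.01
    # gives net_profit = 0.01 - 0.004 = 0.006, so reward ≈ 0.006 ** 1.5 * 10 ≈ 0.0046
    # before the prediction-accuracy and PnL terms are applied.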

    def calculate_prediction_reward(self, symbol, predicted_direction, actual_direction,
                                    confidence, predicted_change, actual_change,
                                    current_pnl=0.0, position_duration=0.0):
        """Calculate reward for prediction accuracy"""
        reward = 0.0
        if predicted_direction == actual_direction:
            reward += 1.0 * confidence
        else:
            reward -= 0.5
        if predicted_direction == actual_direction and abs(predicted_change) > 0.001:
            reward += abs(actual_change) * 5.0
        if predicted_direction != actual_direction and abs(predicted_change) > 0.001:
            reward -= abs(predicted_change) * 2.0
        reward += current_pnl * 0.1
        # Dynamic adjustment based on recent PnL to incentivize loss cutting.
        # pnl_history is an optional attribute set externally; it is skipped when absent.
        if hasattr(self, 'pnl_history') and symbol in self.pnl_history and self.pnl_history[symbol]:
            latest_pnl_entry = self.pnl_history[symbol][-1]
            latest_pnl_value = latest_pnl_entry.get('pnl', 0.0) if isinstance(latest_pnl_entry, dict) else 0.0
            if latest_pnl_value < 0 and position_duration > 60:
                reward -= abs(latest_pnl_value) * 0.2
            pnl_values = [entry.get('pnl', 0.0) for entry in self.pnl_history[symbol] if isinstance(entry, dict)]
            best_pnl = max(pnl_values) if pnl_values else 0.0
            if best_pnl < 0.0:
                reward -= 0.1
        return reward
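
    # Worked example: a correct direction call with confidence 0.8, predicted_change
    # 0.01, and actual_change 0.012 scores 0.8 + 0.012 * 5.0 = 0.86 (with zero PnL
    # and no pnl_history set).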


# Example usage:
if __name__ == "__main__":
    # Create a calculator instance
    reward_calc = RewardCalculator()

    # Example reward for a buy action
    buy_reward = reward_calc.calculate_enhanced_reward(action=0, price_change=0)
    print(f"Buy action reward: {buy_reward:.5f}")

    # Example reward for a sell action with profit
    sell_reward = reward_calc.calculate_enhanced_reward(action=1, price_change=0.015, position_held_time=60)
    print(f"Sell action reward (with profit): {sell_reward:.5f}")

    # Example reward for a hold action on a profitable position
    hold_reward = reward_calc.calculate_enhanced_reward(action=2, price_change=0.01, position_held_time=30, is_profitable=True)
    print(f"Hold action reward (profitable): {hold_reward:.5f}")

    # Example reward for a hold action on an unprofitable position
    hold_reward_neg = reward_calc.calculate_enhanced_reward(action=2, price_change=-0.01, position_held_time=30, is_profitable=False)
    print(f"Hold action reward (unprofitable): {hold_reward_neg:.5f}")

    # Record a rapid burst of trades to demonstrate the frequency penalty
    for _ in range(30):
        reward_calc.record_trade(0)
        time.sleep(0.01)
    print(f"Frequency penalty after burst: {reward_calc._calculate_frequency_penalty():.5f}")