cleanup, cob ladder still broken
@@ -9,215 +9,166 @@ rewards for successful holding of positions.
import numpy as np
from datetime import datetime, timedelta
from collections import deque
import logging

logger = logging.getLogger(__name__)

class RewardCalculator:
    def __init__(self, base_fee_rate=0.001, reward_scaling=10.0, risk_aversion=0.1):
        self.base_fee_rate = base_fee_rate
        self.reward_scaling = reward_scaling
        self.risk_aversion = risk_aversion
        self.risk_adjusted = True  # Use Sharpe ratio for risk adjustment

        # Keep track of recent trades for frequency penalties
        self.recent_trades = deque(maxlen=1000)
        self.trade_timestamps = []
        self.frequency_threshold = 10  # Trades per minute before a penalty applies
        self.max_frequency_penalty = 0.05

        # P&L history for risk adjustment
        self.trade_pnls = []
        self.returns = []

        # Additional tracking metrics
        self.total_trades = 0
        self.profitable_trades = 0
        self.total_pnl = 0.0
        self.daily_pnl = {}
        self.hourly_pnl = {}

    def record_pnl(self, pnl):
        """Record a P&L result for risk adjustment and tracking metrics"""
        self.trade_pnls.append(pnl)

        # Update overall metrics
        self.total_trades += 1
        self.total_pnl += pnl

        if pnl > 0:
            self.profitable_trades += 1

        # Track daily and hourly PnL
        now = datetime.now()
        day_key = now.strftime('%Y-%m-%d')
        hour_key = now.strftime('%Y-%m-%d %H:00')

        # Update daily PnL
        if day_key not in self.daily_pnl:
            self.daily_pnl[day_key] = 0.0
        self.daily_pnl[day_key] += pnl

        # Update hourly PnL
        if hour_key not in self.hourly_pnl:
            self.hourly_pnl[hour_key] = 0.0
        self.hourly_pnl[hour_key] += pnl

        # Keep only the most recent 100 P&L results
        if len(self.trade_pnls) > 100:
            self.trade_pnls.pop(0)

    def record_trade(self, action=None, timestamp=None, price=None):
        """Record a trade action for frequency-penalty calculations"""
        if timestamp is None:
            timestamp = datetime.now()

        self.recent_trades.append({
            'timestamp': timestamp,
            'action': action,
            'price': price
        })

        from time import time
        self.trade_timestamps.append(time())
        if len(self.trade_timestamps) > 100:
            self.trade_timestamps.pop(0)

    def _calculate_frequency_penalty(self):
        """Calculate penalty for high-frequency trading"""
        if len(self.trade_timestamps) < 2:
            return 0.0

        # Estimate the recent trade rate from the recorded timestamps
        time_span = self.trade_timestamps[-1] - self.trade_timestamps[0]
        if time_span <= 0:
            return 0.0
        trades_per_minute = (len(self.trade_timestamps) / time_span) * 60

        # Progressive penalty: grows with the excess trade rate, capped at the maximum
        if trades_per_minute > self.frequency_threshold:
            penalty = min(self.max_frequency_penalty,
                          (trades_per_minute - self.frequency_threshold) * 0.001)
            return penalty
        return 0.0
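
    # Worked example (illustrative): 20 trades recorded over a 60-second span give
    # trades_per_minute = (20 / 60) * 60 = 20; that exceeds frequency_threshold = 10,
    # so the penalty is min(0.05, (20 - 10) * 0.001) = 0.01.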

    def _calculate_risk_adjustment(self, reward):
        """Adjust rewards based on risk (simple Sharpe ratio implementation)"""
        if len(self.trade_pnls) < 5:
            return reward  # Not enough data for adjustment

        # Calculate mean and standard deviation of recorded P&L results
        pnl_array = np.array(self.trade_pnls)
        mean_return = np.mean(pnl_array)
        std_return = np.std(pnl_array)

        if std_return == 0:
            return reward  # Avoid division by zero

        # Simplified Sharpe ratio
        sharpe = mean_return / std_return

        # Scale reward by Sharpe ratio (normalized to be around 1.0)
        adjustment_factor = np.clip(1.0 + 0.5 * sharpe, 0.5, 2.0)

        return reward * adjustment_factor
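
    # Worked example (illustrative): with recorded P&L values averaging 0.002 and a
    # standard deviation of 0.004, sharpe = 0.5, so the adjustment factor is
    # clip(1.0 + 0.5 * 0.5, 0.5, 2.0) = 1.25 and the reward is scaled by 1.25.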

    def _calculate_holding_reward(self, position_held_time, price_change):
        """Calculate reward for holding a position"""
        base_holding_reward = 0.0005 * (position_held_time / 60.0)
        if price_change > 0:
            return base_holding_reward * 2
        elif price_change < 0:
            return base_holding_reward * 0.5
        return base_holding_reward
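
    # Worked example (illustrative): holding for 120 time units gives a base holding
    # reward of 0.0005 * (120 / 60) = 0.001, doubled to 0.002 when price_change > 0
    # and halved to 0.0005 when price_change < 0.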

    def calculate_basic_reward(self, pnl, confidence):
        """Calculate basic training reward based on P&L and confidence"""
        try:
            base_reward = pnl

            # Penalize confident losing decisions, boost confident winning ones
            if pnl < 0 and confidence > 0.7:
                confidence_adjustment = -confidence * 2
            elif pnl > 0 and confidence > 0.7:
                confidence_adjustment = confidence * 1.5
            else:
                confidence_adjustment = 0

            final_reward = base_reward + confidence_adjustment

            # Squash into (-1, 1) so the reward scale stays bounded
            normalized_reward = np.tanh(final_reward / 10.0)

            logger.debug(f"Basic reward calculation: P&L={pnl:.4f}, confidence={confidence:.2f}, reward={normalized_reward:.4f}")
            return float(normalized_reward)
        except Exception as e:
            logger.error(f"Error calculating basic reward: {e}")
            return 0.0
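
    # Worked example (illustrative): pnl=2.0 with confidence=0.8 adds a confidence
    # adjustment of 0.8 * 1.5 = 1.2, so the final reward is tanh(3.2 / 10.0) ≈ 0.31.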

    def calculate_enhanced_reward(self, action, price_change, position_held_time=0,
                                  volatility=None, is_profitable=False, confidence=0.0,
                                  predicted_change=0.0, actual_change=0.0,
                                  current_pnl=0.0, symbol='UNKNOWN'):
        """
        Calculate the enhanced reward for a trading action.

        Args:
            action (int): 0 = Buy, 1 = Sell, 2 = Hold
            price_change (float): Percent price change for the trade
            position_held_time (int): Time the position was held (in time units)
            volatility (float, optional): Market volatility measure
            is_profitable (bool): Whether the current position is profitable
            confidence (float): Model confidence in the action
            predicted_change (float): Predicted price change
            actual_change (float): Actual price change
            current_pnl (float): Current unrealized P&L
            symbol (str): Traded symbol

        Returns:
            float: Calculated reward value
        """
        # Trading fee and frequency penalty
        fee = self.base_fee_rate
        frequency_penalty = self._calculate_frequency_penalty()

        # Base reward calculation
        if action == 0:  # Buy
            # Small penalty for the transaction plus the frequency penalty
            reward = -fee - frequency_penalty

        elif action == 1:  # Sell
            # Profit percentage minus fees (both entry and exit)
            net_profit = price_change - (fee * 2)

            # Scale reward and apply frequency penalty
            reward = net_profit * self.reward_scaling
            reward -= frequency_penalty

            # Record PnL for risk adjustment
            self.record_pnl(net_profit)

        else:  # Hold
            # Small reward for holding a profitable position, small cost otherwise
            if is_profitable:
                reward = self._calculate_holding_reward(position_held_time, price_change)
            else:
                reward = -0.0001  # Very small negative reward

        # Apply risk adjustment if enabled
        if self.risk_adjusted:
            reward = self._calculate_risk_adjustment(reward)

        # Reward correct directional calls, penalize incorrect ones
        if action in [0, 1] and predicted_change != 0:
            if (action == 0 and actual_change > 0) or (action == 1 and actual_change < 0):
                reward += abs(actual_change) * 5.0
            else:
                reward -= abs(predicted_change) * 2.0

        # Account for current unrealized P&L, volatility, and return variance
        reward += current_pnl * 0.1
        if volatility is not None:
            reward -= abs(volatility) * 100
        if self.risk_aversion > 0 and len(self.returns) > 1:
            returns_std = np.std(self.returns)
            reward -= returns_std * self.risk_aversion

        # Record this action for future frequency calculations
        self.record_trade(action)

        return reward
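
    # Worked example (illustrative): a sell with price_change=0.015 and the default
    # fee of 0.001 gives net_profit = 0.015 - 0.002 = 0.013 and a base reward of
    # 0.013 * reward_scaling (10.0) = 0.13, before any frequency penalty, risk
    # adjustment, or prediction/PnL/volatility terms are applied.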

    def calculate_prediction_reward(self, symbol, predicted_direction, actual_direction,
                                    confidence, predicted_change, actual_change,
                                    current_pnl=0.0, position_duration=0.0):
        """Calculate reward for prediction accuracy"""
        reward = 0.0

        # Direction accuracy, weighted by confidence
        if predicted_direction == actual_direction:
            reward += 1.0 * confidence
        else:
            reward -= 0.5

        # Magnitude bonus/penalty for non-trivial predicted moves
        if predicted_direction == actual_direction and abs(predicted_change) > 0.001:
            reward += abs(actual_change) * 5.0
        if predicted_direction != actual_direction and abs(predicted_change) > 0.001:
            reward -= abs(predicted_change) * 2.0

        reward += current_pnl * 0.1

        # Dynamic adjustment based on recent PnL (loss-cutting incentive);
        # pnl_history is optional and only used when populated externally
        if hasattr(self, 'pnl_history') and symbol in self.pnl_history and self.pnl_history[symbol]:
            latest_pnl_entry = self.pnl_history[symbol][-1]
            latest_pnl_value = latest_pnl_entry.get('pnl', 0.0) if isinstance(latest_pnl_entry, dict) else 0.0
            if latest_pnl_value < 0 and position_duration > 60:
                reward -= abs(latest_pnl_value) * 0.2
            pnl_values = [entry.get('pnl', 0.0) for entry in self.pnl_history[symbol] if isinstance(entry, dict)]
            best_pnl = max(pnl_values) if pnl_values else 0.0
            if best_pnl < 0.0:
                reward -= 0.1

        return reward


# Example usage:
if __name__ == "__main__":
    # Create calculator instance
    reward_calc = RewardCalculator()

    # Example reward for a buy action
    buy_reward = reward_calc.calculate_enhanced_reward(action=0, price_change=0)
    print(f"Buy action reward: {buy_reward:.5f}")

    # Record a trade for frequency tracking
    reward_calc.record_trade(action=0)

    # Wait a bit and make another trade to test the frequency penalty
    import time
    time.sleep(0.1)

    # Example reward for a sell action with profit
    sell_reward = reward_calc.calculate_enhanced_reward(action=1, price_change=0.015, position_held_time=60)
    print(f"Sell action reward (with profit): {sell_reward:.5f}")

    # Example reward for a hold action on a profitable position
    hold_reward = reward_calc.calculate_enhanced_reward(action=2, price_change=0.01, position_held_time=30, is_profitable=True)
    print(f"Hold action reward (profitable): {hold_reward:.5f}")

    # Example reward for a hold action on an unprofitable position
    hold_reward_neg = reward_calc.calculate_enhanced_reward(action=2, price_change=-0.01, position_held_time=30, is_profitable=False)
    print(f"Hold action reward (unprofitable): {hold_reward_neg:.5f}")
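
    # Additional illustrative calls (not part of the original example; values are arbitrary)
    basic_reward = reward_calc.calculate_basic_reward(pnl=2.0, confidence=0.8)
    print(f"Basic reward: {basic_reward:.5f}")

    prediction_reward = reward_calc.calculate_prediction_reward(
        symbol='ETH/USDT', predicted_direction=1, actual_direction=1,
        confidence=0.8, predicted_change=0.01, actual_change=0.012)
    print(f"Prediction reward: {prediction_reward:.5f}")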