new training process and changes to the models (wip)
@@ -3,6 +3,10 @@ import gym
 from gym import spaces
 from typing import Dict, Tuple, List
 import pandas as pd
+import logging
+
+# Configure logger
+logger = logging.getLogger(__name__)
 
 class TradingEnvironment(gym.Env):
     """
@@ -12,97 +16,284 @@ class TradingEnvironment(gym.Env):
                  data: pd.DataFrame,
                  initial_balance: float = 100.0,
                  fee_rate: float = 0.0002,
-                 max_steps: int = 1000):
+                 max_steps: int = 1000,
+                 window_size: int = 20,
+                 risk_aversion: float = 0.2,     # Controls how much to penalize volatility
+                 price_scaling: str = 'zscore',  # 'zscore', 'minmax', or 'raw'
+                 reward_scaling: float = 10.0,   # Scale factor for rewards
+                 episode_penalty: float = 0.1):  # Penalty for active positions at end of episode
         super(TradingEnvironment, self).__init__()
 
         self.data = data
         self.initial_balance = initial_balance
         self.fee_rate = fee_rate
         self.max_steps = max_steps
+        self.window_size = window_size
+        self.risk_aversion = risk_aversion
+        self.price_scaling = price_scaling
+        self.reward_scaling = reward_scaling
+        self.episode_penalty = episode_penalty
 
-        # Action space: 0 (SELL), 1 (HOLD), 2 (BUY)
+        # Preprocess data if needed
+        self._preprocess_data()
+
+        # Action space: 0 (BUY), 1 (SELL), 2 (HOLD)
         self.action_space = spaces.Discrete(3)
 
         # Observation space: price data, technical indicators, and account state
+        feature_dim = self.data.shape[1] + 3  # Adding position, equity, unrealized_pnl
         self.observation_space = spaces.Box(
             low=-np.inf,
             high=np.inf,
-            shape=(data.shape[1],),  # Number of features
+            shape=(feature_dim,),
             dtype=np.float32
         )
 
         # Initialize state
         self.reset()
+
+    def _preprocess_data(self):
+        """Preprocess data - normalize or standardize features"""
+        # Store the original data for reference
+        self.original_data = self.data.copy()
+
+        # Normalize price data based on the selected method
+        if self.price_scaling == 'zscore':
+            # For each feature, apply z-score normalization
+            for col in self.data.columns:
+                if col in ['open', 'high', 'low', 'close']:
+                    mean = self.data[col].mean()
+                    std = self.data[col].std()
+                    if std > 0:
+                        self.data[col] = (self.data[col] - mean) / std
+                # Normalize volume separately
+                elif col == 'volume':
+                    mean = self.data[col].mean()
+                    std = self.data[col].std()
+                    if std > 0:
+                        self.data[col] = (self.data[col] - mean) / std
+
+        elif self.price_scaling == 'minmax':
+            # For each feature, apply min-max scaling
+            for col in self.data.columns:
+                min_val = self.data[col].min()
+                max_val = self.data[col].max()
+                if max_val > min_val:
+                    self.data[col] = (self.data[col] - min_val) / (max_val - min_val)
+
     def reset(self) -> np.ndarray:
         """Reset the environment to initial state"""
-        self.current_step = 0
+        self.current_step = self.window_size
         self.balance = self.initial_balance
-        self.position = 0  # 0: no position, 1: long position
+        self.position = 0  # 0: no position, 1: long position, -1: short position
         self.entry_price = 0
+        self.entry_time = 0
         self.total_trades = 0
         self.winning_trades = 0
+        self.losing_trades = 0
         self.total_pnl = 0
         self.balance_history = [self.initial_balance]
+        self.equity_history = [self.initial_balance]
+        self.max_balance = self.initial_balance
+        self.max_drawdown = 0
+
+        # Trading performance metrics
+        self.trade_durations = []  # Track how long trades are held
+        self.returns = []          # Track returns of each trade
+
+        # For analyzing trade clustering
+        self.last_action_time = 0
+        self.actions_taken = []
 
         return self._get_observation()
 
     def _get_observation(self) -> np.ndarray:
-        """Get current observation state"""
-        return self.data.iloc[self.current_step].values
+        """Get current observation state with account information"""
+        # Get market data for the current step
+        market_data = self.data.iloc[self.current_step].values
+
+        # Get current price
+        current_price = self.original_data.iloc[self.current_step]['close']
+
+        # Calculate unrealized PnL
+        unrealized_pnl = 0
+        if self.position != 0:
+            price_diff = current_price - self.entry_price
+            unrealized_pnl = self.position * price_diff
+
+        # Calculate total equity (balance + unrealized PnL)
+        equity = self.balance + unrealized_pnl
+
+        # Normalize account state
+        normalized_position = self.position  # -1, 0, or 1
+        normalized_equity = equity / self.initial_balance - 1.0  # Percent change from initial
+        normalized_unrealized_pnl = unrealized_pnl / self.initial_balance if self.initial_balance > 0 else 0
+
+        # Combine market data with account state
+        account_state = np.array([normalized_position, normalized_equity, normalized_unrealized_pnl])
+        observation = np.concatenate([market_data, account_state])
+
+        # Handle any NaN values
+        observation = np.nan_to_num(observation, nan=0.0)
+
+        return observation
 
     def _calculate_reward(self, action: int) -> Tuple[float, float]:
-        """Calculate reward based on action and outcome"""
-        current_price = self.data.iloc[self.current_step]['close']
-
-        # If we have an open position
-        if self.position != 0:
-            # Calculate PnL
-            pnl = self.position * (current_price - self.entry_price) / self.entry_price
-            fees = self.fee_rate * 2  # Entry and exit fees
-
-            # Close position
-            if (action == 0 and self.position > 0) or (action == 2 and self.position < 0):
-                net_pnl = pnl - fees
-                self.total_pnl += net_pnl
-                self.balance *= (1 + net_pnl)
-
-                self.total_trades += 1
-                if net_pnl > 0:
-                    self.winning_trades += 1
-
-                # Reward based on PnL
-                reward = net_pnl * 100  # Scale up for better learning
-
-                # Additional reward for win rate
-                win_rate = self.winning_trades / max(1, self.total_trades)
-                reward += win_rate * 0.1
-
-                # Reset position
-                self.position = 0
-                return reward
-
-            # Hold position
-            return pnl * 0.1  # Small reward for holding profitable positions
-
-        # No position
-        if action == 1:  # HOLD
-            return 0
-
-        # Open new position
-        if action in [0, 2]:  # SELL or BUY
-            self.position = -1 if action == 0 else 1
-            self.entry_price = current_price
-            return -self.fee_rate  # Small penalty for trading
-
-        return 0
+        """
+        Calculate reward based on action and outcome with improved risk-adjusted metrics
+
+        Args:
+            action: The action taken (0=BUY, 1=SELL, 2=HOLD)
+
+        Returns:
+            Tuple[float, float]: (reward, realized PnL for this step)
+        """
+        # Get current price
+        current_price = self.original_data.iloc[self.current_step]['close']
+
+        # Default reward is slightly negative to discourage excessive trading
+        reward = -0.0001
+        pnl = 0.0
+
+        # Handle different actions based on current position
+        if self.position == 0:  # No position
+            if action == 0:  # BUY
+                self.position = 1
+                self.entry_price = current_price
+                self.entry_time = self.current_step
+                reward = -self.fee_rate  # Small penalty for trading cost
+
+            elif action == 1:  # SELL (start short position)
+                self.position = -1
+                self.entry_price = current_price
+                self.entry_time = self.current_step
+                reward = -self.fee_rate  # Small penalty for trading cost
+
+            # else action == 2 (HOLD) - keep the small negative reward
+
+        elif self.position > 0:  # Long position
+            if action == 1:  # SELL (close long)
+                # Calculate profit/loss
+                price_diff = current_price - self.entry_price
+                pnl = price_diff / self.entry_price - 2 * self.fee_rate  # Account for entry and exit fees
+
+                # Adjust reward based on PnL and risk
+                reward = pnl * self.reward_scaling
+
+                # Track trade performance
+                self.total_trades += 1
+                if pnl > 0:
+                    self.winning_trades += 1
+                else:
+                    self.losing_trades += 1
+
+                # Calculate trade duration
+                trade_duration = self.current_step - self.entry_time
+                self.trade_durations.append(trade_duration)
+
+                # Update returns list
+                self.returns.append(pnl)
+
+                # Update balance and reset position
+                self.balance *= (1 + pnl)
+                self.balance_history.append(self.balance)
+                self.max_balance = max(self.max_balance, self.balance)
+                self.total_pnl += pnl
+
+                # Reset position
+                self.position = 0
+
+            elif action == 0:  # BUY (while already long)
+                # Penalize trying to increase an already active position
+                reward = -0.001
+
+            # else action == 2 (HOLD) - calculate unrealized P&L for reward
+            else:
+                price_diff = current_price - self.entry_price
+                unrealized_pnl = price_diff / self.entry_price
+
+                # Small reward/penalty based on unrealized P&L
+                reward = unrealized_pnl * 0.05  # Scale down to encourage holding good positions
+
+        elif self.position < 0:  # Short position
+            if action == 0:  # BUY (close short)
+                # Calculate profit/loss
+                price_diff = self.entry_price - current_price
+                pnl = price_diff / self.entry_price - 2 * self.fee_rate  # Account for entry and exit fees
+
+                # Adjust reward based on PnL and risk
+                reward = pnl * self.reward_scaling
+
+                # Track trade performance
+                self.total_trades += 1
+                if pnl > 0:
+                    self.winning_trades += 1
+                else:
+                    self.losing_trades += 1
+
+                # Calculate trade duration
+                trade_duration = self.current_step - self.entry_time
+                self.trade_durations.append(trade_duration)
+
+                # Update returns list
+                self.returns.append(pnl)
+
+                # Update balance and reset position
+                self.balance *= (1 + pnl)
+                self.balance_history.append(self.balance)
+                self.max_balance = max(self.max_balance, self.balance)
+                self.total_pnl += pnl
+
+                # Reset position
+                self.position = 0
+
+            elif action == 1:  # SELL (while already short)
+                # Penalize trying to increase an already active position
+                reward = -0.001
+
+            # else action == 2 (HOLD) - calculate unrealized P&L for reward
+            else:
+                price_diff = self.entry_price - current_price
+                unrealized_pnl = price_diff / self.entry_price
+
+                # Small reward/penalty based on unrealized P&L
+                reward = unrealized_pnl * 0.05  # Scale down to encourage holding good positions
+
+        # Record the action
+        self.actions_taken.append(action)
+        self.last_action_time = self.current_step
+
+        # Update equity history (balance + unrealized P&L)
+        current_equity = self.balance
+        if self.position != 0:
+            # Calculate unrealized P&L
+            if self.position > 0:  # Long
+                price_diff = current_price - self.entry_price
+                unrealized_pnl = price_diff / self.entry_price * self.balance
+            else:  # Short
+                price_diff = self.entry_price - current_price
+                unrealized_pnl = price_diff / self.entry_price * self.balance
+
+            current_equity = self.balance + unrealized_pnl
+
+        self.equity_history.append(current_equity)
+
+        # Calculate current drawdown
+        peak_equity = max(self.equity_history)
+        current_drawdown = (peak_equity - current_equity) / peak_equity if peak_equity > 0 else 0
+        self.max_drawdown = max(self.max_drawdown, current_drawdown)
+
+        # Apply risk aversion factor - penalize volatility
+        if len(self.returns) > 1:
+            returns_std = np.std(self.returns)
+            reward -= returns_std * self.risk_aversion
+
+        return reward, pnl
 
     def step(self, action: int) -> Tuple[np.ndarray, float, bool, Dict]:
         """Execute one step in the environment"""
-        # Calculate reward
-        reward = self._calculate_reward(action)
+        # Calculate reward and update state
+        reward, pnl = self._calculate_reward(action)
 
         # Move to next step
         self.current_step += 1
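A rough worked example of the reward rule introduced in _calculate_reward above (illustrative only, not part of the diff; the prices and earlier returns are invented, and the new defaults reward_scaling=10.0, risk_aversion=0.2, fee_rate=0.0002 are assumed):

# Illustrative numbers only: closing a long opened at 100.0 with the price now at 101.0
entry_price, current_price = 100.0, 101.0
fee_rate, reward_scaling, risk_aversion = 0.0002, 10.0, 0.2

pnl = (current_price - entry_price) / entry_price - 2 * fee_rate  # 0.01 - 0.0004 = 0.0096
reward = pnl * reward_scaling                                     # 0.096
# With two earlier trade returns [0.004, -0.002], their std is 0.003, so volatility is penalized:
reward -= 0.003 * risk_aversion                                   # 0.096 - 0.0006 = 0.0954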
@@ -110,26 +301,70 @@ class TradingEnvironment(gym.Env):
         # Check if episode is done
         done = self.current_step >= min(self.max_steps - 1, len(self.data) - 1)
 
+        # Apply penalty if episode ends with open position
+        if done and self.position != 0:
+            reward -= self.episode_penalty
+
+            # Force close the position at the end if still open
+            current_price = self.original_data.iloc[self.current_step]['close']
+            if self.position > 0:  # Long position
+                price_diff = current_price - self.entry_price
+                pnl = price_diff / self.entry_price - 2 * self.fee_rate
+            else:  # Short position
+                price_diff = self.entry_price - current_price
+                pnl = price_diff / self.entry_price - 2 * self.fee_rate
+
+            # Update balance
+            self.balance *= (1 + pnl)
+            self.total_pnl += pnl
+
+            # Track trade
+            self.total_trades += 1
+            if pnl > 0:
+                self.winning_trades += 1
+            else:
+                self.losing_trades += 1
+
+            # Reset position
+            self.position = 0
+
         # Get next observation
         observation = self._get_observation()
 
-        # Calculate max drawdown
-        max_drawdown = 0
-        if len(self.balance_history) > 1:
-            peak = self.balance_history[0]
-            for balance in self.balance_history:
-                peak = max(peak, balance)
-                drawdown = (peak - balance) / peak
-                max_drawdown = max(max_drawdown, drawdown)
+        # Calculate sharpe ratio and sortino ratio if possible
+        sharpe_ratio = 0
+        sortino_ratio = 0
+        win_rate = self.winning_trades / max(1, self.total_trades)
+
+        if len(self.returns) > 1:
+            mean_return = np.mean(self.returns)
+            std_return = np.std(self.returns)
+            if std_return > 0:
+                sharpe_ratio = mean_return / std_return
+
+            # For sortino, we only consider downside deviation
+            downside_returns = [r for r in self.returns if r < 0]
+            if downside_returns:
+                downside_deviation = np.std(downside_returns)
+                if downside_deviation > 0:
+                    sortino_ratio = mean_return / downside_deviation
+
+        # Calculate average trade duration
+        avg_trade_duration = np.mean(self.trade_durations) if self.trade_durations else 0
 
         # Additional info
         info = {
             'balance': self.balance,
             'position': self.position,
             'total_trades': self.total_trades,
-            'win_rate': self.winning_trades / max(1, self.total_trades),
+            'win_rate': win_rate,
             'total_pnl': self.total_pnl,
-            'max_drawdown': max_drawdown
+            'max_drawdown': self.max_drawdown,
+            'sharpe_ratio': sharpe_ratio,
+            'sortino_ratio': sortino_ratio,
+            'avg_trade_duration': avg_trade_duration,
+            'pnl': pnl,
+            'gain': (self.balance - self.initial_balance) / self.initial_balance
         }
 
         return observation, reward, done, info
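A minimal usage sketch of the updated environment (illustrative, not part of the commit): it assumes a DataFrame with open/high/low/close/volume columns and a random policy, and reads the new metrics from the info dict returned by step().

import numpy as np
import pandas as pd

# Hypothetical OHLCV data, purely for illustration
n = 500
close = 100 + np.cumsum(np.random.randn(n))
df = pd.DataFrame({'open': close, 'high': close + 0.5,
                   'low': close - 0.5, 'close': close,
                   'volume': np.random.rand(n) * 1000})

env = TradingEnvironment(df, initial_balance=100.0, window_size=20,
                         risk_aversion=0.2, price_scaling='zscore',
                         reward_scaling=10.0, episode_penalty=0.1)

obs = env.reset()                       # market features + [position, equity, unrealized_pnl]
done = False
while not done:
    action = env.action_space.sample()  # 0=BUY, 1=SELL, 2=HOLD
    obs, reward, done, info = env.step(action)

print(info['win_rate'], info['sharpe_ratio'], info['max_drawdown'], info['gain'])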
@@ -143,20 +378,19 @@ class TradingEnvironment(gym.Env):
         print(f"Total Trades: {self.total_trades}")
         print(f"Win Rate: {self.winning_trades/max(1, self.total_trades):.2%}")
         print(f"Total PnL: ${self.total_pnl:.2f}")
-        print(f"Max Drawdown: {self._calculate_max_drawdown():.2%}")
+        print(f"Max Drawdown: {self.max_drawdown:.2%}")
+        print(f"Sharpe Ratio: {self._calculate_sharpe_ratio():.4f}")
         print("-" * 50)
 
-    def _calculate_max_drawdown(self):
-        """Calculate maximum drawdown from balance history"""
-        if len(self.balance_history) <= 1:
+    def _calculate_sharpe_ratio(self):
+        """Calculate Sharpe ratio from returns"""
+        if len(self.returns) < 2:
             return 0.0
 
-        peak = self.balance_history[0]
-        max_drawdown = 0.0
+        mean_return = np.mean(self.returns)
+        std_return = np.std(self.returns)
 
-        for balance in self.balance_history:
-            peak = max(peak, balance)
-            drawdown = (peak - balance) / peak
-            max_drawdown = max(max_drawdown, drawdown)
+        if std_return == 0:
+            return 0.0
 
-        return max_drawdown
+        return mean_return / std_return
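As a reference for the metrics added to info, a small numeric illustration of how the diff computes Sharpe and Sortino from the per-trade returns list (no risk-free rate or annualization; the return values below are made up):

import numpy as np

returns = [0.01, -0.004, 0.006, -0.002]         # hypothetical per-trade returns
mean_return = np.mean(returns)                  # 0.0025
sharpe_ratio = mean_return / np.std(returns)    # std over all trade returns

downside = [r for r in returns if r < 0]        # losing trades only
sortino_ratio = mean_return / np.std(downside)  # downside deviation, as in the diff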