RL training in NN working

This commit is contained in:
Dobromir Popov 2025-03-31 11:12:45 +03:00
parent 4eac14022c
commit 0ad9484d56
3 changed files with 449 additions and 167 deletions

View File

@@ -13,10 +13,9 @@ logger = logging.getLogger(__name__)
class CNNModelPyTorch(nn.Module):
"""
CNN model for trading signals
Simplified version for RL training
CNN model for trading with multiple timeframes
"""
def __init__(self, window_size: int, num_features: int, output_size: int, timeframes: List[str]):
def __init__(self, window_size, num_features, output_size, timeframes):
super(CNNModelPyTorch, self).__init__()
self.window_size = window_size
@@ -24,12 +23,33 @@ class CNNModelPyTorch(nn.Module):
self.output_size = output_size
self.timeframes = timeframes
# Calculate total input features across all timeframes
self.total_features = num_features * len(timeframes)
# Device configuration
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f"Using device: {self.device}")
# Build model
self.build_model()
# Convolutional layers
self.conv1 = nn.Conv1d(self.total_features, 64, kernel_size=3, padding=1)
self.bn1 = nn.BatchNorm1d(64)
self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
self.bn2 = nn.BatchNorm1d(128)
self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
self.bn3 = nn.BatchNorm1d(256)
# Calculate size after convolutions
conv_output_size = window_size * 256
# Fully connected layers
self.fc1 = nn.Linear(conv_output_size, 512)
self.fc2 = nn.Linear(512, 256)
# Advantage and Value streams (Dueling DQN architecture)
self.fc3 = nn.Linear(256, output_size) # Advantage stream
self.value_fc = nn.Linear(256, 1) # Value stream
# Initialize optimizer and scheduler
self.optimizer = optim.Adam(self.parameters(), lr=0.001)
@@ -40,36 +60,6 @@ class CNNModelPyTorch(nn.Module):
# Move model to device
self.to(self.device)
def build_model(self):
"""Build the CNN architecture"""
# First Convolutional Layer
self.conv1 = nn.Conv1d(
in_channels=self.num_features * len(self.timeframes),
out_channels=32,
kernel_size=3,
padding=1
)
self.bn1 = nn.BatchNorm1d(32)
# Second Convolutional Layer
self.conv2 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
self.bn2 = nn.BatchNorm1d(64)
# Third Convolutional Layer
self.conv3 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
self.bn3 = nn.BatchNorm1d(128)
# Calculate size after convolutions
conv_out_size = self.window_size * 128
# Fully connected layers
self.fc1 = nn.Linear(conv_out_size, 512)
self.fc2 = nn.Linear(512, 256)
self.fc3 = nn.Linear(256, self.output_size)
# Additional output for value estimation
self.value_fc = nn.Linear(256, 1)
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""Forward pass through the network"""
# Ensure input is on the correct device
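The hunk truncates the forward() body here. As a hedged sketch only: the following shows one common way a dueling head aggregates the advantage (fc3) and value (value_fc) streams defined above into Q-values; the commit's actual forward() may instead return the two streams separately, as the Tuple return annotation suggests. The helper name and the assumed (batch, window_size, total_features) input layout are illustrative, not from the commit.

import torch
import torch.nn.functional as F

def dueling_forward_sketch(model, x):
    """Illustrative only: aggregate the two streams as Q = V + (A - mean(A))."""
    # Assumed input layout (batch, window_size, total_features); Conv1d wants channels first
    x = x.to(model.device).permute(0, 2, 1)
    x = F.relu(model.bn1(model.conv1(x)))
    x = F.relu(model.bn2(model.conv2(x)))
    x = F.relu(model.bn3(model.conv3(x)))
    x = x.flatten(1)                                   # (batch, window_size * 256)
    x = F.relu(model.fc1(x))
    x = F.relu(model.fc2(x))
    advantage = model.fc3(x)                           # (batch, output_size)
    value = model.value_fc(x)                          # (batch, 1)
    q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
    return q_values, value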

View File

@@ -8,6 +8,7 @@ import os
import sys
import pandas as pd
import gym
import json
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -28,19 +29,31 @@ logging.basicConfig(
]
)
class RLTradingEnvironment(TradingEnvironment):
"""Extended trading environment that reshapes state for CNN"""
def __init__(self, data, window_size, num_features, num_timeframes, **kwargs):
# Set attributes before parent initialization
class RLTradingEnvironment(gym.Env):
"""
Reinforcement Learning environment for trading with technical indicators
from multiple timeframes
"""
def __init__(self, features_1m, features_5m, features_15m, window_size=20, trading_fee=0.001):
super().__init__()
# Initialize attributes before parent class
self.window_size = window_size
self.num_features = num_features
self.num_timeframes = num_timeframes
self.feature_dim = num_features * num_timeframes
self.num_features = features_1m.shape[1] - 1 # Exclude close price
self.num_timeframes = 3 # 1m, 5m, 15m
self.feature_dim = self.num_features * self.num_timeframes
# Initialize parent class
super().__init__(data=data, **kwargs)
# Store features from different timeframes
self.features_1m = features_1m
self.features_5m = features_5m
self.features_15m = features_15m
# Update observation space for CNN
# Trading parameters
self.initial_balance = 1.0
self.trading_fee = trading_fee
# Define action and observation spaces
self.action_space = gym.spaces.Discrete(3) # 0: Buy, 1: Sell, 2: Hold
self.observation_space = gym.spaces.Box(
low=-np.inf,
high=np.inf,
@@ -48,145 +61,354 @@ class RLTradingEnvironment(TradingEnvironment):
dtype=np.float32
)
# State variables
self.reset()
def reset(self):
"""Reset the environment to initial state"""
self.balance = self.initial_balance
self.position = 0.0 # Amount of asset held
self.current_step = self.window_size
self.trades = 0
self.wins = 0
self.losses = 0
self.trade_history = []
# Get initial observation
observation = self._get_observation()
return observation
def _get_observation(self):
"""Get current observation reshaped for CNN"""
# Get flattened observation from parent class
flat_obs = super()._get_observation()
"""
Get the current state observation.
Combine features from multiple timeframes, reshaped for the CNN.
"""
# Calculate indices for each timeframe
idx_1m = self.current_step
idx_5m = idx_1m // 5
idx_15m = idx_1m // 15
# Extract features (exclude close price)
features = flat_obs[:-1] # Remove close price
# Extract feature windows from each timeframe
window_1m = self.features_1m[idx_1m - self.window_size:idx_1m]
# Calculate number of complete windows
n_windows = len(features) // self.feature_dim
if n_windows < self.window_size:
# Pad with zeros if not enough data
padding = np.zeros((self.window_size - n_windows, self.feature_dim))
reshaped = np.vstack([
padding,
features[-(n_windows * self.feature_dim):].reshape(n_windows, self.feature_dim)
])
else:
# Take the most recent window_size windows
start_idx = (n_windows - self.window_size) * self.feature_dim
reshaped = features[start_idx:].reshape(self.window_size, self.feature_dim)
# Handle 5m timeframe
start_5m = max(0, idx_5m - self.window_size)
window_5m = self.features_5m[start_5m:idx_5m]
return reshaped.astype(np.float32)
# Handle 15m timeframe
start_15m = max(0, idx_15m - self.window_size)
window_15m = self.features_15m[start_15m:idx_15m]
def train_rl():
# Pad if needed (for 5m and 15m)
if len(window_5m) < self.window_size:
padding = np.zeros((self.window_size - len(window_5m), window_5m.shape[1]))
window_5m = np.vstack([padding, window_5m])
if len(window_15m) < self.window_size:
padding = np.zeros((self.window_size - len(window_15m), window_15m.shape[1]))
window_15m = np.vstack([padding, window_15m])
# Combine features from all timeframes
combined_features = np.hstack([
window_1m.reshape(self.window_size, -1),
window_5m.reshape(self.window_size, -1),
window_15m.reshape(self.window_size, -1)
])
# Convert to float32 and handle any NaN values
combined_features = np.nan_to_num(combined_features, nan=0.0).astype(np.float32)
return combined_features
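# Alignment example (illustrative, not part of the commit): at current_step = 100 with
# window_size = 20, the 1m window covers rows 80..99; idx_5m = 100 // 5 = 20 yields the
# 20 5m rows 0..19 with no padding, while idx_15m = 100 // 15 = 6 yields only 6 rows,
# which are zero-padded on the left up to the full 20-row window.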
def step(self, action):
"""
Take an action in the environment and return the next state, reward, done flag, and info
Args:
action (int): 0 = Buy, 1 = Sell, 2 = Hold
Returns:
tuple: (observation, reward, done, info)
"""
# Get current and next price
current_price = self.features_1m[self.current_step, -1] # Close price is last column
next_price = self.features_1m[self.current_step + 1, -1]
# Handle zero or negative prices
if current_price <= 0:
current_price = 1e-8 # Small positive number
if next_price <= 0:
next_price = current_price # Use current price if next price is invalid
price_change = (next_price - current_price) / current_price
# Default reward is slightly negative to discourage inaction
reward = -0.0001
done = False
# Execute action
if action == 0: # BUY
if self.position == 0: # Only buy if not already in position
self.position = self.balance * (1 - self.trading_fee)
self.balance = 0
self.trades += 1
reward = 0 # Neutral reward for entering position
self.trade_entry_price = current_price
elif action == 1: # SELL
if self.position > 0: # Only sell if in position
# Calculate position value at current price
position_value = self.position * (1 + price_change)
self.balance = position_value * (1 - self.trading_fee)
# Calculate profit/loss from trade
profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price
reward = profit_pct * 10 # Scale reward by profit percentage
# Update win/loss count
if profit_pct > 0:
self.wins += 1
else:
self.losses += 1
# Record trade
self.trade_history.append({
'entry_price': self.trade_entry_price,
'exit_price': next_price,
'profit_pct': profit_pct
})
# Reset position
self.position = 0
# else: (action == 2 - HOLD) - no position change
# Move to next step
self.current_step += 1
# Check if done
if self.current_step >= len(self.features_1m) - 1:
done = True
# Apply final evaluation
if self.position > 0:
# Force close position at the end
position_value = self.position * (1 + price_change)
self.balance = position_value * (1 - self.trading_fee)
profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price
reward += profit_pct * 10
# Update win/loss count
if profit_pct > 0:
self.wins += 1
else:
self.losses += 1
# Get the next observation
observation = self._get_observation()
# Calculate metrics for info
total_value = self.balance + self.position * next_price
gain = (total_value - self.initial_balance) / self.initial_balance
self.win_rate = self.wins / max(1, self.trades)
info = {
'balance': self.balance,
'position': self.position,
'total_value': total_value,
'gain': gain,
'trades': self.trades,
'win_rate': self.win_rate
}
return observation, reward, done, info
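A hedged usage sketch, not part of the commit: a random-action rollout through the environment, using synthetic feature arrays whose last column stands in for the close price, matching how step() reads prices above. The array sizes and random-walk close series are illustrative assumptions.

import numpy as np

rng = np.random.default_rng(0)
n_1m = 600
features_1m = rng.normal(size=(n_1m, 10)).astype(np.float32)
features_1m[:, -1] = 100 + np.cumsum(rng.normal(size=n_1m))   # synthetic close price
features_5m = rng.normal(size=(n_1m // 5, 10)).astype(np.float32)
features_15m = rng.normal(size=(n_1m // 15, 10)).astype(np.float32)

env = RLTradingEnvironment(features_1m, features_5m, features_15m, window_size=20)
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()                 # 0: Buy, 1: Sell, 2: Hold
    obs, reward, done, info = env.step(action)
print(f"total_value={info['total_value']:.4f} trades={info['trades']} win_rate={info['win_rate']:.2f}")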
def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent"):
"""
Train the RL model using the DQN agent
Train DQN agent for RL-based trading with extended training and monitoring
"""
# Initialize data interface with BTC/USDT and multiple timeframes
timeframes = ['1m', '5m', '15m']
logger.info("Starting extended RL training for trading...")
# Environment setup
window_size = 20
data_interface = DataInterface(symbol="BTC/USDT", timeframes=timeframes)
timeframes = ["1m", "5m", "15m"]
trading_fee = 0.001
# Get training data
X_train, y_train, X_val, y_val, train_prices, val_prices = data_interface.prepare_training_data()
if X_train is None:
logger.error("Failed to get training data")
return
# Ensure save directory exists
os.makedirs(os.path.dirname(save_path), exist_ok=True)
# Calculate feature dimensions
num_features = X_train.shape[2] # Number of features per timeframe
total_features = num_features * len(timeframes) # Total features across all timeframes
# Setup TensorBoard for monitoring
writer = SummaryWriter(f'runs/rl_training_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
# Flatten features for environment
n_samples = X_train.shape[0]
flattened_features = X_train.reshape(n_samples, window_size, -1) # Reshape to (batch, window, features)
# Data loading
data_interface = DataInterface(
symbol="BTC/USDT",
timeframes=timeframes
)
# Create DataFrame with features as separate columns
feature_columns = [f'feature_{i}' for i in range(flattened_features.shape[2])]
df = pd.DataFrame(flattened_features.reshape(n_samples, -1), columns=feature_columns * window_size)
df['close'] = train_prices
# Get training data for each timeframe with more data
features_1m = data_interface.get_training_data("1m", n_candles=5000)
features_5m = data_interface.get_training_data("5m", n_candles=2500)
features_15m = data_interface.get_training_data("15m", n_candles=2500)
if features_1m is None or features_5m is None or features_15m is None:
logger.error("Failed to load training data")
return None
# Convert DataFrames to numpy arrays, excluding timestamp column
features_1m = features_1m.drop('timestamp', axis=1, errors='ignore').values
features_5m = features_5m.drop('timestamp', axis=1, errors='ignore').values
features_15m = features_15m.drop('timestamp', axis=1, errors='ignore').values
# Calculate number of features per timeframe
num_features = features_1m.shape[1] # Number of features after dropping timestamp
# Create environment
env = RLTradingEnvironment(
data=df,
features_1m=features_1m,
features_5m=features_5m,
features_15m=features_15m,
window_size=window_size,
num_features=num_features,
num_timeframes=len(timeframes),
initial_balance=10000,
fee_rate=0.001,
max_steps=1000
trading_fee=trading_fee
)
# Create DQN agent
# Create agent with adjusted parameters for longer training
state_size = window_size
action_size = 3
agent = DQNAgent(
state_size=window_size, # First dimension of observation space
action_size=env.action_space.n,
state_size=state_size,
action_size=action_size,
window_size=window_size,
num_features=num_features,
timeframes=timeframes,
learning_rate=0.001,
gamma=0.99,
learning_rate=0.0005, # Reduced learning rate for stability
gamma=0.99, # Increased discount factor
epsilon=1.0,
epsilon_min=0.01,
epsilon_decay=0.995,
memory_size=10000,
batch_size=32,
target_update=10
epsilon_decay=0.999, # Slower epsilon decay
memory_size=50000, # Increased memory size
batch_size=128 # Increased batch size
)
# Training parameters
episodes = 1000
max_steps = 1000
# Variables to track best performance
best_reward = float('-inf')
best_model_path = 'NN/models/saved/best_rl_model.pth'
best_episode = 0
best_pnl = float('-inf')
best_win_rate = 0.0
# Create models directory if it doesn't exist
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
# Training metrics
episode_rewards = []
episode_pnls = []
episode_win_rates = []
episode_trades = []
# Check if previous best model exists and load it
best_model_path = f"{save_path}_best"
if os.path.exists(f"{best_model_path}_policy.pt"):
try:
logger.info(f"Loading previous best model from {best_model_path}")
agent.load(best_model_path)
metadata_path = f"{best_model_path}_metadata.json"
if os.path.exists(metadata_path):
with open(metadata_path, 'r') as f:
metadata = json.load(f)
best_reward = metadata.get('best_reward', best_reward)
best_episode = metadata.get('best_episode', best_episode)
best_pnl = metadata.get('best_pnl', best_pnl)
best_win_rate = metadata.get('best_win_rate', best_win_rate)
logger.info(f"Loaded previous best metrics - Reward: {best_reward:.4f}, PnL: {best_pnl:.4f}, Win Rate: {best_win_rate:.4f}")
except Exception as e:
logger.error(f"Error loading previous best model: {e}")
# Training loop
for episode in range(episodes):
state = env.reset()
total_reward = 0
try:
for episode in range(1, num_episodes + 1):
state = env.reset()
total_reward = 0
done = False
steps = 0
for step in range(max_steps):
# Get action from agent
action = agent.act(state)
while not done and steps < max_steps:
action = agent.act(state)
next_state, reward, done, info = env.step(action)
agent.remember(state, action, reward, next_state, done)
# Take action in environment
next_state, reward, done, info = env.step(action)
# Store experience in agent's memory
agent.remember(state, action, reward, next_state, done)
# Train agent
if len(agent.memory) > agent.batch_size:
# Learn from experience
loss = agent.replay()
if loss is not None:
logger.debug(f"Training loss: {loss:.4f}")
# Update state and reward
state = next_state
total_reward += reward
state = next_state
total_reward += reward
steps += 1
if done:
break
# Calculate episode metrics
episode_rewards.append(total_reward)
episode_pnls.append(info['gain'])
episode_win_rates.append(info['win_rate'])
episode_trades.append(info['trades'])
# Update epsilon
agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)
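# Illustrative arithmetic (not in the commit): with epsilon_decay = 0.999 and
# epsilon_min = 0.01, this per-episode decay hits the floor after roughly
# ln(0.01) / ln(0.999) ≈ 4603 episodes, i.e. near the end of the 5000-episode run.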
# Log to TensorBoard
writer.add_scalar('Reward/episode', total_reward, episode)
writer.add_scalar('PnL/episode', info['gain'], episode)
writer.add_scalar('WinRate/episode', info['win_rate'], episode)
writer.add_scalar('Trades/episode', info['trades'], episode)
writer.add_scalar('Epsilon/episode', agent.epsilon, episode)
# Log episode results
logger.info(f"Episode: {episode + 1}/{episodes}")
logger.info(f"Total Reward: {total_reward:.2f}")
logger.info(f"Final Balance: {info['balance']:.2f}")
logger.info(f"Max Drawdown: {info['max_drawdown']:.2%}")
logger.info(f"Win Rate: {info['win_rate']:.2%}")
logger.info(f"Epsilon: {agent.epsilon:.4f}")
# Save the best model based on multiple metrics (only every 50 episodes)
is_better = False
if episode % 50 == 0: # Only check for saving every 50 episodes
if (info['gain'] > best_pnl and info['win_rate'] > 0.5) or \
(info['gain'] > best_pnl * 1.1) or \
(info['win_rate'] > best_win_rate * 1.1):
best_reward = total_reward
best_episode = episode
best_pnl = info['gain']
best_win_rate = info['win_rate']
agent.save(best_model_path)
is_better = True
# Save best model
if total_reward > best_reward:
best_reward = total_reward
agent.save(best_model_path)
logger.info(f"New best model saved with reward: {best_reward:.2f}")
# Save metadata about the best model
metadata = {
'best_reward': best_reward,
'best_episode': best_episode,
'best_pnl': best_pnl,
'best_win_rate': best_win_rate,
'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}
with open(f"{best_model_path}_metadata.json", 'w') as f:
json.dump(metadata, f)
# Save checkpoint every 100 episodes
if (episode + 1) % 100 == 0:
checkpoint_path = f'NN/models/saved/rl_model_episode_{episode + 1}.pth'
agent.save(checkpoint_path)
logger.info(f"Checkpoint saved at episode {episode + 1}")
# Log training progress
if episode % 10 == 0:
avg_reward = sum(episode_rewards[-10:]) / 10
avg_pnl = sum(episode_pnls[-10:]) / 10
avg_win_rate = sum(episode_win_rates[-10:]) / 10
avg_trades = sum(episode_trades[-10:]) / 10
status = "NEW BEST!" if is_better else ""
logger.info(f"Episode {episode}/{num_episodes} {status}")
logger.info(f"Metrics (last 10 episodes):")
logger.info(f" Reward: {avg_reward:.4f}")
logger.info(f" PnL: {avg_pnl:.4f}")
logger.info(f" Win Rate: {avg_win_rate:.4f}")
logger.info(f" Trades: {avg_trades:.2f}")
logger.info(f" Epsilon: {agent.epsilon:.4f}")
logger.info(f"Best so far - PnL: {best_pnl:.4f}, Win Rate: {best_win_rate:.4f}")
except KeyboardInterrupt:
logger.info("Training interrupted by user. Saving best model...")
# Close TensorBoard writer
writer.close()
# Final logs
logger.info(f"Training completed. Best model from episode {best_episode}")
logger.info(f"Best metrics:")
logger.info(f" Reward: {best_reward:.4f}")
logger.info(f" PnL: {best_pnl:.4f}")
logger.info(f" Win Rate: {best_win_rate:.4f}")
# Return the agent for potential further use
return agent
if __name__ == "__main__":
train_rl()

View File

@@ -303,6 +303,76 @@ class DataInterface:
"""
return len(self.timeframes) * 5 # OHLCV features for each timeframe
def get_features(self, timeframe, n_candles=1000):
"""
Get feature data with technical indicators for a specific timeframe.
Args:
timeframe (str): Timeframe to get features for ('1m', '5m', etc.)
n_candles (int): Number of candles to get
Returns:
np.ndarray: Array of feature data including technical indicators
and the close price as the last column
"""
# Get historical data
df = self.get_historical_data(timeframe=timeframe, n_candles=n_candles)
if df is None or df.empty:
logger.error(f"No data available for {self.symbol} {timeframe}")
return None
# Add technical indicators
df = self.add_technical_indicators(df)
# Drop NaN values that might have been introduced by indicators
df = df.dropna()
# Extract features (all columns except timestamp)
features = df.drop('timestamp', axis=1).values
logger.info(f"Prepared {len(features)} {timeframe} feature rows with {features.shape[1]} features")
return features
def add_technical_indicators(self, df):
"""
Add technical indicators to the dataframe.
Args:
df (pd.DataFrame): DataFrame with OHLCV data
Returns:
pd.DataFrame: DataFrame with added technical indicators
"""
# Make a copy to avoid modifying the original
df_copy = df.copy()
# Basic price indicators
df_copy['returns'] = df_copy['close'].pct_change()
df_copy['log_returns'] = np.log(df_copy['close']/df_copy['close'].shift(1))
# Moving Averages
df_copy['sma_7'] = ta.trend.sma_indicator(df_copy['close'], window=7)
df_copy['sma_25'] = ta.trend.sma_indicator(df_copy['close'], window=25)
df_copy['sma_99'] = ta.trend.sma_indicator(df_copy['close'], window=99)
# MACD
macd = ta.trend.MACD(df_copy['close'])
df_copy['macd'] = macd.macd()
df_copy['macd_signal'] = macd.macd_signal()
df_copy['macd_diff'] = macd.macd_diff()
# RSI
df_copy['rsi'] = ta.momentum.rsi(df_copy['close'], window=14)
# Bollinger Bands
bollinger = ta.volatility.BollingerBands(df_copy['close'])
df_copy['bb_high'] = bollinger.bollinger_hband()
df_copy['bb_low'] = bollinger.bollinger_lband()
df_copy['bb_pct'] = bollinger.bollinger_pband()
return df_copy
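A hedged usage sketch, not part of the commit: it runs the new add_technical_indicators over a synthetic OHLCV frame and inspects a few of the added columns. It assumes DataInterface can be instantiated offline with just symbol and timeframes, as in train_rl above; the synthetic data and the chosen columns are illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 200
close = 100 + np.cumsum(rng.normal(size=n))
ohlcv = pd.DataFrame({
    'timestamp': pd.date_range('2025-01-01', periods=n, freq='1min'),
    'open': close,
    'high': close + 1.0,
    'low': close - 1.0,
    'close': close,
    'volume': rng.random(n) * 10.0,
})

di = DataInterface(symbol="BTC/USDT", timeframes=["1m"])   # assumed constructible without fetching data
with_ind = di.add_technical_indicators(ohlcv)
print(with_ind[['close', 'rsi', 'macd', 'bb_pct']].tail())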
def calculate_pnl(self, predictions, actual_prices, position_size=1.0):
"""
Robust PnL calculator that handles:
@@ -557,33 +627,33 @@ class DataInterface:
# Calculate technical indicators
try:
# Add RSI (14)
df['rsi'] = ta.rsi(df['close'], length=14)
df['rsi'] = ta.momentum.rsi(df['close'], window=14)
# Add MACD
macd = ta.macd(df['close'])
df['macd'] = macd['MACD_12_26_9']
df['macd_signal'] = macd['MACDs_12_26_9']
df['macd_hist'] = macd['MACDh_12_26_9']
macd = ta.trend.MACD(df['close'])
df['macd'] = macd.macd()
df['macd_signal'] = macd.macd_signal()
df['macd_hist'] = macd.macd_diff()
# Add Bollinger Bands
bbands = ta.bbands(df['close'], length=20)
df['bb_upper'] = bbands['BBU_20_2.0']
df['bb_middle'] = bbands['BBM_20_2.0']
df['bb_lower'] = bbands['BBL_20_2.0']
bbands = ta.volatility.BollingerBands(df['close'])
df['bb_upper'] = bbands.bollinger_hband()
df['bb_middle'] = bbands.bollinger_mavg()
df['bb_lower'] = bbands.bollinger_lband()
# Add ATR (Average True Range)
df['atr'] = ta.atr(df['high'], df['low'], df['close'], length=14)
df['atr'] = ta.volatility.average_true_range(df['high'], df['low'], df['close'], window=14)
# Add moving averages
df['sma_20'] = ta.sma(df['close'], length=20)
df['sma_50'] = ta.sma(df['close'], length=50)
df['ema_20'] = ta.ema(df['close'], length=20)
df['sma_20'] = ta.trend.sma_indicator(df['close'], window=20)
df['sma_50'] = ta.trend.sma_indicator(df['close'], window=50)
df['ema_20'] = ta.trend.ema_indicator(df['close'], window=20)
# Add OBV (On-Balance Volume)
df['obv'] = ta.obv(df['close'], df['volume'])
df['obv'] = ta.volume.on_balance_volume(df['close'], df['volume'])
# Add momentum indicators
df['mom'] = ta.mom(df['close'], length=10)
df['mom'] = ta.momentum.roc(df['close'], window=10)
# Normalize price to previous close
df['close_norm'] = df['close'] / df['close'].shift(1) - 1